diff --git a/.clang-tidy b/.clang-tidy index 06bb0f18e9d2e..2cda1b81de808 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1,3 +1,4 @@ +HeaderFilterRegex: '' Checks: > -*, clang-diagnostic-*, diff --git a/.github/workflows/containers/github-action-ci-tooling/Dockerfile b/.github/workflows/containers/github-action-ci-tooling/Dockerfile index be61264b93753..b78c99efb9be3 100644 --- a/.github/workflows/containers/github-action-ci-tooling/Dockerfile +++ b/.github/workflows/containers/github-action-ci-tooling/Dockerfile @@ -108,6 +108,7 @@ RUN apt-get update && \ abi-compliance-checker \ abi-dumper \ autoconf \ + parallel \ pkg-config && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/.github/workflows/llvm-abi-tests.yml b/.github/workflows/llvm-abi-tests.yml index b0c2d32d4a41b..f75dd9c3abd9e 100644 --- a/.github/workflows/llvm-abi-tests.yml +++ b/.github/workflows/llvm-abi-tests.yml @@ -10,13 +10,13 @@ on: - 'release/**' paths: - 'llvm/**' - - '.github/workflows/llvm-tests.yml' + - '.github/workflows/llvm-abi-tests.yml' pull_request: branches: - 'release/**' paths: - 'llvm/**' - - '.github/workflows/llvm-tests.yml' + - '.github/workflows/llvm-abi-tests.yml' concurrency: # Skip intermediate builds: always. @@ -72,6 +72,8 @@ jobs: if: github.repository_owner == 'llvm' needs: abi-dump-setup runs-on: ubuntu-24.04 + container: + image: "ghcr.io/llvm/ci-ubuntu-24.04-abi-tests@sha256:01e66b0847c1e9c88f0bd0492ed7c3374550a0730b48040f63888393f1ff6c13" #ghcr.io/llvm/ci-ubuntu-24.04-abi-tests:bb0bd382ab2b" strategy: matrix: name: @@ -87,19 +89,6 @@ jobs: ref: ${{ github.sha }} repo: ${{ github.repository }} steps: - - name: Install Ninja - uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main - - name: Install abi-compliance-checker - run: | - sudo apt-get update - sudo apt-get -y install abi-dumper autoconf pkg-config - - name: Install universal-ctags - run: | - git clone https://github.com/universal-ctags/ctags.git - cd ctags - ./autogen.sh - ./configure - sudo make install - name: Download source code uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 with: @@ -143,6 +132,8 @@ jobs: abi-compare: if: github.repository_owner == 'llvm' runs-on: ubuntu-24.04 + container: + image: "ghcr.io/llvm/ci-ubuntu-24.04-abi-tests@sha256:01e66b0847c1e9c88f0bd0492ed7c3374550a0730b48040f63888393f1ff6c13" #ghcr.io/llvm/ci-ubuntu-24.04-abi-tests:bb0bd382ab2b needs: - abi-dump-setup - abi-dump @@ -163,10 +154,6 @@ jobs: name: symbol-list path: symbol-list - - name: Install abi-compliance-checker - run: | - sudo apt-get update - sudo apt-get -y install abi-compliance-checker - name: Compare ABI run: | if [ -s symbol-list/llvm.symbols ]; then diff --git a/.github/workflows/test-unprivileged-download-artifact.yml b/.github/workflows/test-unprivileged-download-artifact.yml new file mode 100644 index 0000000000000..a9c0912b0f44e --- /dev/null +++ b/.github/workflows/test-unprivileged-download-artifact.yml @@ -0,0 +1,54 @@ +name: Test Unprivileged Download Artifact Action + +permissions: + contents: read + +on: + push: + branches: + - main + paths: + - .github/workflows/test-unprivileged-download-artifact.yml + - '.github/workflows/unprivileged-download-artifact/**' + pull_request: + paths: + - .github/workflows/test-unprivileged-download-artifact.yml + - '.github/workflows/unprivileged-download-artifact/**' + +jobs: + upload-test-artifact: + name: Upload Test Artifact + if: github.repository_owner == 'llvm' + runs-on: ubuntu-24.04 + steps: + - name: Create Test File 
+        run: |
+          echo "test" > comment
+      - name: Upload Test File
+        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
+        with:
+          name: workflow-args
+          path: |
+            comment
+
+  test-download:
+    name: Test Unprivileged Download Artifact
+    if: github.repository_owner == 'llvm'
+    runs-on: ubuntu-24.04
+    needs: [ upload-test-artifact ]
+    steps:
+      - name: Checkout LLVM
+        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        with:
+          sparse-checkout: |
+            .github/workflows/unprivileged-download-artifact/action.yml
+      - name: Download Artifact
+        uses: ./.github/workflows/unprivileged-download-artifact
+        id: download-artifact
+        with:
+          run-id: ${{ github.run_id }}
+          artifact-name: workflow-args
+      - name: Assert That Contents are the Same
+        run: |
+          cat comment
+          [[ "$(cat comment)" == "test" ]]
diff --git a/bolt/include/bolt/Core/DebugData.h b/bolt/include/bolt/Core/DebugData.h
index 7c8ea12ee3ee3..faf7bb62c6bee 100644
--- a/bolt/include/bolt/Core/DebugData.h
+++ b/bolt/include/bolt/Core/DebugData.h
@@ -471,6 +471,12 @@ class DebugStrOffsetsWriter {
     return std::move(StrOffsetsBuffer);
   }
 
+  /// Returns the contents of .debug_str_offsets.
+  StringRef getBufferStr() {
+    return StringRef(reinterpret_cast<const char *>(StrOffsetsBuffer->data()),
+                     StrOffsetsBuffer->size());
+  }
+
   /// Initializes Buffer and Stream.
   void initialize(DWARFUnit &Unit);
 
@@ -507,6 +513,12 @@ class DebugStrWriter {
     return std::move(StrBuffer);
   }
 
+  /// Returns the contents of .debug_str.
+  StringRef getBufferStr() {
+    return StringRef(reinterpret_cast<const char *>(StrBuffer->data()),
+                     StrBuffer->size());
+  }
+
   /// Adds string to .debug_str.
   /// On first invocation it initializes internal data structures.
   uint32_t addString(StringRef Str);
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index 5e3fa931e826f..816acb229fec5 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -1723,7 +1723,76 @@ StringRef getSectionName(const SectionRef &Section) {
   return Name;
 }
 
-// Extracts an appropriate slice if input is DWP.
+/// Extracts the slice of the .debug_str.dwo section for a given CU from a DWP
+/// file, based on the .debug_str_offsets.dwo section. This helps address DWO
+/// bloat that may occur after updates.
+///
+/// A slice of .debug_str.dwo may be composed of several non-contiguous
+/// fragments. These non-contiguous string views are written out sequentially,
+/// avoiding the copying overhead of assembling them first.
+///
+/// The .debug_str_offsets for the first CU often does not need to be updated,
+/// so copying is only performed when .debug_str_offsets requires updating.
+static void updateStrAndStrOffsets(StringRef StrDWOContent,
+                                   StringRef StrOffsetsContent,
+                                   SmallVectorImpl<StringRef> &StrDWOOutData,
+                                   std::string &StrOffsetsOutData,
+                                   unsigned DwarfVersion, bool IsLittleEndian) {
+  const llvm::endianness Endian =
+      IsLittleEndian ? llvm::endianness::little : llvm::endianness::big;
+  const uint64_t HeaderOffset = (DwarfVersion >= 5) ? 8 : 0;
+  constexpr size_t SizeOfOffset = sizeof(int32_t);
+  const uint64_t NumOffsets =
+      (StrOffsetsContent.size() - HeaderOffset) / SizeOfOffset;
+
+  DataExtractor Extractor(StrOffsetsContent, IsLittleEndian, 0);
+  uint64_t ExtractionOffset = HeaderOffset;
+
+  using StringFragment = DWARFUnitIndex::Entry::SectionContribution;
+  const auto getStringLength = [](StringRef Content,
+                                  uint64_t Offset) -> uint64_t {
+    size_t NullPos = Content.find('\0', Offset);
+    return (NullPos != StringRef::npos) ? (NullPos - Offset + 1) : 0;
+  };
+  const auto isContiguous = [](const StringFragment &Fragment,
+                               uint64_t NextOffset) -> bool {
+    return NextOffset == Fragment.getOffset() + Fragment.getLength();
+  };
+  std::optional<StringFragment> CurrentFragment;
+  uint64_t AccumulatedStrLen = 0;
+  for (uint64_t I = 0; I < NumOffsets; ++I) {
+    const uint64_t StrOffset = Extractor.getU32(&ExtractionOffset);
+    const uint64_t StringLength = getStringLength(StrDWOContent, StrOffset);
+    if (!CurrentFragment) {
+      // First fragment.
+      CurrentFragment = StringFragment(StrOffset, StringLength);
+    } else {
+      if (isContiguous(*CurrentFragment, StrOffset)) {
+        // Extend the current fragment.
+        CurrentFragment->setLength(CurrentFragment->getLength() + StringLength);
+      } else {
+        // Save the current fragment and start a new one.
+        StrDWOOutData.push_back(StrDWOContent.substr(
+            CurrentFragment->getOffset(), CurrentFragment->getLength()));
+        CurrentFragment = StringFragment(StrOffset, StringLength);
+      }
+    }
+    if (AccumulatedStrLen != StrOffset) {
+      // Rewrite this string offset relative to the new slice.
+      if (StrOffsetsOutData.empty())
+        StrOffsetsOutData = StrOffsetsContent.str();
+      llvm::support::endian::write32(
+          &StrOffsetsOutData[HeaderOffset + I * SizeOfOffset],
+          static_cast<uint32_t>(AccumulatedStrLen), Endian);
+    }
+    AccumulatedStrLen += StringLength;
+  }
+  if (CurrentFragment)
+    StrDWOOutData.push_back(StrDWOContent.substr(CurrentFragment->getOffset(),
+                                                 CurrentFragment->getLength()));
+}
+
+// Extracts an appropriate slice if input is DWP.
 // Applies patches or overwrites the section.
 std::optional<StringRef> updateDebugData(
     DWARFContext &DWCtx, StringRef SectionName, StringRef SectionContents,
@@ -1772,6 +1841,8 @@ std::optional<StringRef> updateDebugData(
     errs() << "BOLT-WARNING: unsupported debug section: " << SectionName
            << "\n";
   if (StrWriter.isInitialized()) {
+    if (CUDWOEntry)
+      return StrWriter.getBufferStr();
     OutputBuffer = StrWriter.releaseBuffer();
     return StringRef(reinterpret_cast<const char *>(OutputBuffer->data()),
                      OutputBuffer->size());
   }
@@ -1786,6 +1857,8 @@
   }
   case DWARFSectionKind::DW_SECT_STR_OFFSETS: {
     if (StrOffstsWriter.isFinalized()) {
+      if (CUDWOEntry)
+        return StrOffstsWriter.getBufferStr();
      OutputBuffer = StrOffstsWriter.releaseBuffer();
      return StringRef(reinterpret_cast<const char *>(OutputBuffer->data()),
                       OutputBuffer->size());
    }
@@ -1888,6 +1961,10 @@ void DWARFRewriter::writeDWOFiles(
     }
   }
 
+  StringRef StrDWOContent;
+  StringRef StrOffsetsContent;
+  llvm::SmallVector<StringRef> StrDWOOutData;
+  std::string StrOffsetsOutData;
   for (const SectionRef &Section : File->sections()) {
     std::unique_ptr<DebugBufferVector> OutputData;
     StringRef SectionName = getSectionName(Section);
@@ -1895,11 +1972,50 @@ void DWARFRewriter::writeDWOFiles(
       continue;
     Expected<StringRef> ContentsExp = Section.getContents();
     assert(ContentsExp && "Invalid contents.");
+    if (IsDWP && SectionName == "debug_str.dwo") {
+      if (StrWriter.isInitialized())
+        StrDWOContent = StrWriter.getBufferStr();
+      else
+        StrDWOContent = *ContentsExp;
+      continue;
+    }
     if (std::optional<StringRef> OutData = updateDebugData(
             (*DWOCU)->getContext(), SectionName, *ContentsExp, KnownSections,
             *Streamer, *this, CUDWOEntry, DWOId, OutputData, RangeListssWriter,
-            LocWriter, StrOffstsWriter, StrWriter, OverridenSections))
+            LocWriter, StrOffstsWriter, StrWriter, OverridenSections)) {
+      if (IsDWP && SectionName == "debug_str_offsets.dwo") {
+        StrOffsetsContent = *OutData;
+        continue;
+      }
       Streamer->emitBytes(*OutData);
+    }
+  }
+
+  if (IsDWP) {
+    // Handle .debug_str.dwo and .debug_str_offsets.dwo together. In the
+    // original DWP, .debug_str.dwo is a deduplicated global table, and the
+    // .debug_str.dwo slice for a single CU needs to be extracted according to
+    // .debug_str_offsets.dwo.
+    updateStrAndStrOffsets(StrDWOContent, StrOffsetsContent, StrDWOOutData,
+                           StrOffsetsOutData, CU.getVersion(),
+                           (*DWOCU)->getContext().isLittleEndian());
+    auto SectionIter = KnownSections.find("debug_str.dwo");
+    if (SectionIter != KnownSections.end()) {
+      Streamer->switchSection(SectionIter->second.first);
+      for (StringRef OutData : StrDWOOutData)
+        if (!OutData.empty())
+          Streamer->emitBytes(OutData);
+    }
+    SectionIter = KnownSections.find("debug_str_offsets.dwo");
+    if (SectionIter != KnownSections.end()) {
+      Streamer->switchSection(SectionIter->second.first);
+      if (!StrOffsetsOutData.empty())
+        Streamer->emitBytes(StrOffsetsOutData);
+      else
+        Streamer->emitBytes(StrOffsetsContent);
+    }
   }
   Streamer->finish();
   TempOut->keep();
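To make the offset rewrite above concrete, here is a standalone sketch of the same bookkeeping in plain C++. It is illustrative only: the names are invented, and where the patch records contiguous fragments to avoid copying, this sketch simply copies each referenced string into the new slice.

#include <cstdint>
#include <cstring>
#include <string>
#include <utility>
#include <vector>

// Compact a NUL-terminated string table down to the strings referenced by
// Offsets, returning the new table and the offsets rewritten to point into it.
static std::pair<std::string, std::vector<uint32_t>>
compactStrings(const std::string &Table, const std::vector<uint32_t> &Offsets) {
  std::string Slice;
  std::vector<uint32_t> NewOffsets;
  NewOffsets.reserve(Offsets.size());
  for (uint32_t Off : Offsets) {
    const char *Str = Table.c_str() + Off;
    NewOffsets.push_back(static_cast<uint32_t>(Slice.size()));
    Slice.append(Str, std::strlen(Str) + 1); // keep the terminating NUL
  }
  return {Slice, NewOffsets};
}

// For example, Table = "main\0int\0helper\0" with Offsets = {9, 5} yields
// Slice = "helper\0int\0" and NewOffsets = {0, 7}, mirroring how a CU's
// slice of the DWP-wide .debug_str.dwo is emitted with patched offsets.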
diff --git a/bolt/test/X86/Inputs/dwarf4-str-split-dwarf.s b/bolt/test/X86/Inputs/dwarf4-str-split-dwarf.s
new file mode 100644
index 0000000000000..cc951b689a5c6
--- /dev/null
+++ b/bolt/test/X86/Inputs/dwarf4-str-split-dwarf.s
@@ -0,0 +1,330 @@
+#--- main.s
+# clang++ -g2 -gdwarf-4 -gsplit-dwarf=split -gno-pubnames -S main.cpp
+# extern int getReturn();
+# int main() {
+#   return getReturn();
+# }
+	.file	"main.cpp"
+	.globl	main                    # -- Begin function main
+	.type	main,@function
+main:                                   # @main
+.Lfunc_begin0:
+	.file	1 "." "main.cpp"
+	.loc	1 2 0                   # main.cpp:2:0
+	.loc	1 3 10 prologue_end     # main.cpp:3:10
+	.loc	1 3 3 epilogue_begin is_stmt 0 # main.cpp:3:3
+	retq
+.Lfunc_end0:
+	.size	main, .Lfunc_end0-main
+	.section	.debug_abbrev,"",@progbits
+	.byte	1                       # Abbreviation Code
+	.byte	17                      # DW_TAG_compile_unit
+	.byte	0                       # DW_CHILDREN_no
+	.byte	16                      # DW_AT_stmt_list
+	.byte	23                      # DW_FORM_sec_offset
+	.byte	27                      # DW_AT_comp_dir
+	.byte	14                      # DW_FORM_strp
+	.ascii	"\260B"                 # DW_AT_GNU_dwo_name
+	.byte	14                      # DW_FORM_strp
+	.ascii	"\261B"                 # DW_AT_GNU_dwo_id
+	.byte	7                       # DW_FORM_data8
+	.byte	17                      # DW_AT_low_pc
+	.byte	1                       # DW_FORM_addr
+	.byte	18                      # DW_AT_high_pc
+	.byte	6                       # DW_FORM_data4
+	.ascii	"\263B"                 # DW_AT_GNU_addr_base
+	.byte	23                      # DW_FORM_sec_offset
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	0                       # EOM(3)
+	.section	.debug_info,"",@progbits
+.Lcu_begin0:
+	.long	.Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+	.short	4                       # DWARF version number
+	.long	.debug_abbrev           # Offset Into Abbrev. Section
+	.byte	8                       # Address Size (in bytes)
+	.byte	1                       # Abbrev [1] 0xb:0x25 DW_TAG_compile_unit
+	.long	.Lline_table_start0     # DW_AT_stmt_list
+	.long	.Lskel_string0          # DW_AT_comp_dir
+	.long	.Lskel_string1          # DW_AT_GNU_dwo_name
+	.quad	-9094791692727444213    # DW_AT_GNU_dwo_id
+	.quad	.Lfunc_begin0           # DW_AT_low_pc
+	.long	.Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+	.long	.Laddr_table_base0      # DW_AT_GNU_addr_base
+.Ldebug_info_end0:
+	.section	.debug_str,"MS",@progbits,1
+.Lskel_string0:
+	.asciz	"."
# string offset=0 +.Lskel_string1: + .asciz "main.dwo" # string offset=2 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "main" # string offset=0 +.Linfo_string1: + .asciz "int" # string offset=5 +.Linfo_string2: + .asciz "clang version 22.0.0" # string offset=9 +.Linfo_string3: + .asciz "main.cpp" # string offset=30 +.Linfo_string4: + .asciz "main.dwo" # string offset=39 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 5 + .long 9 + .long 30 + .long 39 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 4 # DWARF version number + .long 0 # Offset Into Abbrev. Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x22 DW_TAG_compile_unit + .byte 2 # DW_AT_producer + .short 33 # DW_AT_language + .byte 3 # DW_AT_name + .byte 4 # DW_AT_GNU_dwo_name + .quad -9094791692727444213 # DW_AT_GNU_dwo_id + .byte 2 # Abbrev [2] 0x19:0xf DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 0 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .long 40 # DW_AT_type + # DW_AT_external + .byte 3 # Abbrev [3] 0x28:0x4 DW_TAG_base_type + .byte 1 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\260B" # DW_AT_GNU_dwo_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .ascii "\201>" # DW_FORM_GNU_addr_index + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_addr,"",@progbits +.Laddr_table_base0: + .quad .Lfunc_begin0 + .ident "clang version 22.0.0" + .section ".note.GNU-stack","",@progbits + .addrsig + .addrsig_sym _Z9getReturnv + .section .debug_line,"",@progbits +.Lline_table_start0: +#--- helper.s +# clang++ -g2 -gdwarf-4 -gsplit-dwarf=split -gno-pubnames -S helper.cpp +# int getReturn() { +# return 0; +# } + .file "helper.cpp" + .globl _Z9getReturnv # -- Begin function _Z9getReturnv + .type _Z9getReturnv,@function +_Z9getReturnv: # @_Z9getReturnv +.Lfunc_begin0: + .file 1 "." 
"helper.cpp" + .loc 1 1 0 # helper.cpp:1:0 + .loc 1 2 3 prologue_end # helper.cpp:2:3 + .loc 1 2 3 epilogue_begin is_stmt 0 # helper.cpp:2:3 + retq +.Lfunc_end0: + .size _Z9getReturnv, .Lfunc_end0-_Z9getReturnv + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 0 # DW_CHILDREN_no + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 14 # DW_FORM_strp + .ascii "\260B" # DW_AT_GNU_dwo_name + .byte 14 # DW_FORM_strp + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .ascii "\263B" # DW_AT_GNU_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x25 DW_TAG_compile_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lskel_string0 # DW_AT_comp_dir + .long .Lskel_string1 # DW_AT_GNU_dwo_name + .quad 5976014880088676049 # DW_AT_GNU_dwo_id + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_GNU_addr_base +.Ldebug_info_end0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." # string offset=0 +.Lskel_string1: + .asciz "helper.dwo" # string offset=2 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "_Z9getReturnv" # string offset=0 +.Linfo_string1: + .asciz "getReturn" # string offset=14 +.Linfo_string2: + .asciz "int" # string offset=24 +.Linfo_string3: + .asciz "clang version 22.0.0" # string offset=28 +.Linfo_string4: + .asciz "helper.cpp" # string offset=49 +.Linfo_string5: + .asciz "helper.dwo" # string offset=60 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 14 + .long 24 + .long 28 + .long 49 + .long 60 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 4 # DWARF version number + .long 0 # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x23 DW_TAG_compile_unit + .byte 3 # DW_AT_producer + .short 33 # DW_AT_language + .byte 4 # DW_AT_name + .byte 5 # DW_AT_GNU_dwo_name + .quad 5976014880088676049 # DW_AT_GNU_dwo_id + .byte 2 # Abbrev [2] 0x19:0x10 DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 0 # DW_AT_linkage_name + .byte 1 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .long 41 # DW_AT_type + # DW_AT_external + .byte 3 # Abbrev [3] 0x29:0x4 DW_TAG_base_type + .byte 2 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\260B" # DW_AT_GNU_dwo_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .ascii "\201>" # DW_FORM_GNU_addr_index + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 110 # DW_AT_linkage_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_addr,"",@progbits +.Laddr_table_base0: + .quad .Lfunc_begin0 + .ident "clang version 22.0.0" + .section ".note.GNU-stack","",@progbits + .addrsig + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/X86/Inputs/dwarf5-str-split-dwarf.s b/bolt/test/X86/Inputs/dwarf5-str-split-dwarf.s new file mode 100644 index 0000000000000..5e938ea98bf95 --- /dev/null +++ b/bolt/test/X86/Inputs/dwarf5-str-split-dwarf.s @@ -0,0 +1,368 @@ +#--- main.s +# clang++ -g2 -gdwarf-5 -gsplit-dwarf=split -gno-pubnames -S main.cpp +# extern int getReturn(); +# int main() { +# return getReturn(); +# } + .file "main.cpp" + .globl main # -- Begin function main + .type main,@function +main: # @main +.Lfunc_begin0: + .file 0 "." 
"main.cpp" md5 0x9cdef858e26cf684ed9ef3b60e05bdad + .loc 0 2 0 # main.cpp:2:0 + .loc 0 3 10 prologue_end # main.cpp:3:10 + .loc 0 3 3 epilogue_begin is_stmt 0 # main.cpp:3:3 + retq +.Lfunc_end0: + .size main, .Lfunc_end0-main + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 74 # DW_TAG_skeleton_unit + .byte 0 # DW_CHILDREN_no + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 114 # DW_AT_str_offsets_base + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 115 # DW_AT_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 4 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .quad -9094791692727444213 + .byte 1 # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lstr_offsets_base0 # DW_AT_str_offsets_base + .byte 0 # DW_AT_comp_dir + .byte 1 # DW_AT_dwo_name + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_addr_base +.Ldebug_info_end0: + .section .debug_str_offsets,"",@progbits + .long 12 # Length of String Offsets Set + .short 5 + .short 0 +.Lstr_offsets_base0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." # string offset=0 +.Lskel_string1: + .asciz "main.dwo" # string offset=2 + .section .debug_str_offsets,"",@progbits + .long .Lskel_string0 + .long .Lskel_string1 + .section .debug_str_offsets.dwo,"e",@progbits + .long 24 # Length of String Offsets Set + .short 5 + .short 0 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "main" # string offset=0 +.Linfo_string1: + .asciz "int" # string offset=5 +.Linfo_string2: + .asciz "clang version 22.0.0" # string offset=9 +.Linfo_string3: + .asciz "main.cpp" # string offset=30 +.Linfo_string4: + .asciz "main.dwo" # string offset=39 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 5 + .long 9 + .long 30 + .long 39 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 5 # DWARF version number + .byte 5 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. 
Section + .quad -9094791692727444213 + .byte 1 # Abbrev [1] 0x14:0x1a DW_TAG_compile_unit + .byte 2 # DW_AT_producer + .short 33 # DW_AT_language + .byte 3 # DW_AT_name + .byte 4 # DW_AT_dwo_name + .byte 2 # Abbrev [2] 0x1a:0xf DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 0 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .long 41 # DW_AT_type + # DW_AT_external + .byte 3 # Abbrev [3] 0x29:0x4 DW_TAG_base_type + .byte 1 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 37 # DW_FORM_strx1 + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_addr,"",@progbits + .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution +.Ldebug_addr_start0: + .short 5 # DWARF version number + .byte 8 # Address size + .byte 0 # Segment selector size +.Laddr_table_base0: + .quad .Lfunc_begin0 +.Ldebug_addr_end0: + .ident "clang version 22.0.0" + .section ".note.GNU-stack","",@progbits + .addrsig + .addrsig_sym _Z9getReturnv + .section .debug_line,"",@progbits +.Lline_table_start0: +#--- helper.s +# clang++ -g2 -gdwarf-5 -gsplit-dwarf=split -gno-pubnames -S helper.cpp +# int getReturn() { +# return 0; +# } + .file "helper.cpp" + .globl _Z9getReturnv # -- Begin function _Z9getReturnv + .type _Z9getReturnv,@function +_Z9getReturnv: # @_Z9getReturnv +.Lfunc_begin0: + .file 0 "." 
"helper.cpp" md5 0xc7d7879297b54325c71b3e0cfbb65e2d + .loc 0 1 0 # helper.cpp:1:0 + .loc 0 2 3 prologue_end # helper.cpp:2:3 + .loc 0 2 3 epilogue_begin is_stmt 0 # helper.cpp:2:3 + retq +.Lfunc_end0: + .size _Z9getReturnv, .Lfunc_end0-_Z9getReturnv + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 74 # DW_TAG_skeleton_unit + .byte 0 # DW_CHILDREN_no + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 114 # DW_AT_str_offsets_base + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 115 # DW_AT_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 4 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .quad 5976014880088676049 + .byte 1 # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lstr_offsets_base0 # DW_AT_str_offsets_base + .byte 0 # DW_AT_comp_dir + .byte 1 # DW_AT_dwo_name + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_addr_base +.Ldebug_info_end0: + .section .debug_str_offsets,"",@progbits + .long 12 # Length of String Offsets Set + .short 5 + .short 0 +.Lstr_offsets_base0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." # string offset=0 +.Lskel_string1: + .asciz "helper.dwo" # string offset=2 + .section .debug_str_offsets,"",@progbits + .long .Lskel_string0 + .long .Lskel_string1 + .section .debug_str_offsets.dwo,"e",@progbits + .long 28 # Length of String Offsets Set + .short 5 + .short 0 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "_Z9getReturnv" # string offset=0 +.Linfo_string1: + .asciz "getReturn" # string offset=14 +.Linfo_string2: + .asciz "int" # string offset=24 +.Linfo_string3: + .asciz "clang version 22.0.0" # string offset=28 +.Linfo_string4: + .asciz "helper.cpp" # string offset=49 +.Linfo_string5: + .asciz "helper.dwo" # string offset=60 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 14 + .long 24 + .long 28 + .long 49 + .long 60 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 5 # DWARF version number + .byte 5 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. 
Section + .quad 5976014880088676049 + .byte 1 # Abbrev [1] 0x14:0x1b DW_TAG_compile_unit + .byte 3 # DW_AT_producer + .short 33 # DW_AT_language + .byte 4 # DW_AT_name + .byte 5 # DW_AT_dwo_name + .byte 2 # Abbrev [2] 0x1a:0x10 DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 0 # DW_AT_linkage_name + .byte 1 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .long 42 # DW_AT_type + # DW_AT_external + .byte 3 # Abbrev [3] 0x2a:0x4 DW_TAG_base_type + .byte 2 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 37 # DW_FORM_strx1 + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 110 # DW_AT_linkage_name + .byte 37 # DW_FORM_strx1 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_addr,"",@progbits + .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution +.Ldebug_addr_start0: + .short 5 # DWARF version number + .byte 8 # Address size + .byte 0 # Segment selector size +.Laddr_table_base0: + .quad .Lfunc_begin0 +.Ldebug_addr_end0: + .ident "clang version 22.0.0" + .section ".note.GNU-stack","",@progbits + .addrsig + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/X86/dwarf4-str-dwp-input-dwo-output.test b/bolt/test/X86/dwarf4-str-dwp-input-dwo-output.test new file mode 100644 index 0000000000000..a0e8721374a87 --- /dev/null +++ b/bolt/test/X86/dwarf4-str-dwp-input-dwo-output.test @@ -0,0 +1,76 @@ +; RUN: split-file %p/Inputs/dwarf4-str-split-dwarf.s %t +; RUN: cd %t +; RUN: llvm-mc --split-dwarf-file=main.dwo --triple=x86_64-unknown-linux-gnu \ +; RUN: --filetype=obj main.s -o=main.o +; RUN: llvm-mc --split-dwarf-file=helper.dwo --triple=x86_64-unknown-linux-gnu \ +; RUN: --filetype=obj helper.s -o=helper.o +; RUN: %clang %cflags -gdwarf-4 -gsplit-dwarf=split main.o helper.o -o main.exe +; RUN: llvm-dwp -e main.exe -o main.exe.dwp +; RUN: llvm-dwarfdump --show-form --verbose --debug-str main.exe.dwp \ +; RUN: | FileCheck -check-prefix=PRE-BOLT-STR %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets main.exe.dwp \ +; RUN: | FileCheck -check-prefix=PRE-BOLT-STR-OFFSETS %s +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-dwarfdump --show-form --verbose --debug-str main.dwo.dwo \ +; RUN: 
| FileCheck -check-prefix=BOLT-MAIN-STR %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets main.dwo.dwo \ +; RUN: | FileCheck -check-prefix=BOLT-MAIN-STR-OFFSETS %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str helper.dwo.dwo \ +; RUN: | FileCheck -check-prefix=BOLT-HELPER-STR %s +; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets helper.dwo.dwo \ +; RUN: | FileCheck -check-prefix=BOLT-HELPER-STR-OFFSETS %s + +;; For DWARF4, this test checks that strings are split correctly from a combined +;; section in DWP file, into appropriate .dwo files. + +; PRE-BOLT-STR: 0x00000000: "main" +; PRE-BOLT-STR: 0x00000005: "int" +; PRE-BOLT-STR: 0x00000009: "clang version 22.0.0" +; PRE-BOLT-STR: 0x0000001e: "main.cpp" +; PRE-BOLT-STR: 0x00000027: "main.dwo" +; PRE-BOLT-STR: 0x00000030: "_Z9getReturnv" +; PRE-BOLT-STR: 0x0000003e: "getReturn" +; PRE-BOLT-STR: 0x00000048: "helper.cpp" +; PRE-BOLT-STR: 0x00000053: "helper.dwo" + +; PRE-BOLT-STR-OFFSETS: 0x00000000: Contribution size = 20, Format = DWARF32, Version = 4 +; PRE-BOLT-STR-OFFSETS: 0x00000000: 00000000 "main" +; PRE-BOLT-STR-OFFSETS: 0x00000004: 00000005 "int" +; PRE-BOLT-STR-OFFSETS: 0x00000008: 00000009 "clang version 22.0.0" +; PRE-BOLT-STR-OFFSETS: 0x0000000c: 0000001e "main.cpp" +; PRE-BOLT-STR-OFFSETS: 0x00000010: 00000027 "main.dwo" +; PRE-BOLT-STR-OFFSETS: 0x00000014: Contribution size = 24, Format = DWARF32, Version = 4 +; PRE-BOLT-STR-OFFSETS: 0x00000014: 00000030 "_Z9getReturnv" +; PRE-BOLT-STR-OFFSETS: 0x00000018: 0000003e "getReturn" +; PRE-BOLT-STR-OFFSETS: 0x0000001c: 00000005 "int" +; PRE-BOLT-STR-OFFSETS: 0x00000020: 00000009 "clang version 22.0.0" +; PRE-BOLT-STR-OFFSETS: 0x00000024: 00000048 "helper.cpp" +; PRE-BOLT-STR-OFFSETS: 0x00000028: 00000053 "helper.dwo" + +; BOLT-MAIN-STR: 0x00000000: "main" +; BOLT-MAIN-STR: 0x00000005: "int" +; BOLT-MAIN-STR: 0x00000009: "clang version 22.0.0" +; BOLT-MAIN-STR: 0x0000001e: "main.cpp" +; BOLT-MAIN-STR: 0x00000027: "main.dwo" + +; BOLT-MAIN-STR-OFFSETS: 0x00000000: Contribution size = 20, Format = DWARF32, Version = 4 +; BOLT-MAIN-STR-OFFSETS: 0x00000000: 00000000 "main" +; BOLT-MAIN-STR-OFFSETS: 0x00000004: 00000005 "int" +; BOLT-MAIN-STR-OFFSETS: 0x00000008: 00000009 "clang version 22.0.0" +; BOLT-MAIN-STR-OFFSETS: 0x0000000c: 0000001e "main.cpp" +; BOLT-MAIN-STR-OFFSETS: 0x00000010: 00000027 "main.dwo" + +; BOLT-HELPER-STR: 0x00000000: "_Z9getReturnv" +; BOLT-HELPER-STR: 0x0000000e: "getReturn" +; BOLT-HELPER-STR: 0x00000018: "int" +; BOLT-HELPER-STR: 0x0000001c: "clang version 22.0.0" +; BOLT-HELPER-STR: 0x00000031: "helper.cpp" +; BOLT-HELPER-STR: 0x0000003c: "helper.dwo" + +; BOLT-HELPER-STR-OFFSETS: 0x00000000: Contribution size = 24, Format = DWARF32, Version = 4 +; BOLT-HELPER-STR-OFFSETS: 0x00000000: 00000000 "_Z9getReturnv" +; BOLT-HELPER-STR-OFFSETS: 0x00000004: 0000000e "getReturn" +; BOLT-HELPER-STR-OFFSETS: 0x00000008: 00000018 "int" +; BOLT-HELPER-STR-OFFSETS: 0x0000000c: 0000001c "clang version 22.0.0" +; BOLT-HELPER-STR-OFFSETS: 0x00000010: 00000031 "helper.cpp" +; BOLT-HELPER-STR-OFFSETS: 0x00000014: 0000003c "helper.dwo" diff --git a/bolt/test/X86/dwarf5-str-dwp-input-dwo-output.test b/bolt/test/X86/dwarf5-str-dwp-input-dwo-output.test new file mode 100644 index 0000000000000..2e72c6a808924 --- /dev/null +++ b/bolt/test/X86/dwarf5-str-dwp-input-dwo-output.test @@ -0,0 +1,76 @@ +; RUN: split-file %p/Inputs/dwarf5-str-split-dwarf.s %t +; RUN: cd %t +; RUN: llvm-mc --split-dwarf-file=main.dwo --triple=x86_64-unknown-linux-gnu \ 
+; RUN:   --filetype=obj main.s -o=main.o
+; RUN: llvm-mc --split-dwarf-file=helper.dwo --triple=x86_64-unknown-linux-gnu \
+; RUN:   --filetype=obj helper.s -o=helper.o
+; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o helper.o -o main.exe
+; RUN: llvm-dwp -e main.exe -o main.exe.dwp
+; RUN: llvm-dwarfdump --show-form --verbose --debug-str main.exe.dwp \
+; RUN:   | FileCheck -check-prefix=PRE-BOLT-STR %s
+; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets main.exe.dwp \
+; RUN:   | FileCheck -check-prefix=PRE-BOLT-STR-OFFSETS %s
+; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections
+; RUN: llvm-dwarfdump --show-form --verbose --debug-str main.dwo.dwo \
+; RUN:   | FileCheck -check-prefix=BOLT-MAIN-STR %s
+; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets main.dwo.dwo \
+; RUN:   | FileCheck -check-prefix=BOLT-MAIN-STR-OFFSETS %s
+; RUN: llvm-dwarfdump --show-form --verbose --debug-str helper.dwo.dwo \
+; RUN:   | FileCheck -check-prefix=BOLT-HELPER-STR %s
+; RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets helper.dwo.dwo \
+; RUN:   | FileCheck -check-prefix=BOLT-HELPER-STR-OFFSETS %s
+
+;; For DWARF5, this test checks that strings are split correctly from a combined
+;; section in DWP file, into appropriate .dwo files.
+
+; PRE-BOLT-STR: 0x00000000: "main"
+; PRE-BOLT-STR: 0x00000005: "int"
+; PRE-BOLT-STR: 0x00000009: "clang version 22.0.0"
+; PRE-BOLT-STR: 0x0000001e: "main.cpp"
+; PRE-BOLT-STR: 0x00000027: "main.dwo"
+; PRE-BOLT-STR: 0x00000030: "_Z9getReturnv"
+; PRE-BOLT-STR: 0x0000003e: "getReturn"
+; PRE-BOLT-STR: 0x00000048: "helper.cpp"
+; PRE-BOLT-STR: 0x00000053: "helper.dwo"
+
+; PRE-BOLT-STR-OFFSETS: 0x00000000: Contribution size = 24, Format = DWARF32, Version = 5
+; PRE-BOLT-STR-OFFSETS: 0x00000008: 00000000 "main"
+; PRE-BOLT-STR-OFFSETS: 0x0000000c: 00000005 "int"
+; PRE-BOLT-STR-OFFSETS: 0x00000010: 00000009 "clang version 22.0.0"
+; PRE-BOLT-STR-OFFSETS: 0x00000014: 0000001e "main.cpp"
+; PRE-BOLT-STR-OFFSETS: 0x00000018: 00000027 "main.dwo"
+; PRE-BOLT-STR-OFFSETS: 0x0000001c: Contribution size = 28, Format = DWARF32, Version = 5
+; PRE-BOLT-STR-OFFSETS: 0x00000024: 00000030 "_Z9getReturnv"
+; PRE-BOLT-STR-OFFSETS: 0x00000028: 0000003e "getReturn"
+; PRE-BOLT-STR-OFFSETS: 0x0000002c: 00000005 "int"
+; PRE-BOLT-STR-OFFSETS: 0x00000030: 00000009 "clang version 22.0.0"
+; PRE-BOLT-STR-OFFSETS: 0x00000034: 00000048 "helper.cpp"
+; PRE-BOLT-STR-OFFSETS: 0x00000038: 00000053 "helper.dwo"
+
+; BOLT-MAIN-STR: 0x00000000: "main"
+; BOLT-MAIN-STR: 0x00000005: "int"
+; BOLT-MAIN-STR: 0x00000009: "clang version 22.0.0"
+; BOLT-MAIN-STR: 0x0000001e: "main.cpp"
+; BOLT-MAIN-STR: 0x00000027: "main.dwo"
+
+; BOLT-MAIN-STR-OFFSETS: 0x00000000: Contribution size = 24, Format = DWARF32, Version = 5
+; BOLT-MAIN-STR-OFFSETS: 0x00000008: 00000000 "main"
+; BOLT-MAIN-STR-OFFSETS: 0x0000000c: 00000005 "int"
+; BOLT-MAIN-STR-OFFSETS: 0x00000010: 00000009 "clang version 22.0.0"
+; BOLT-MAIN-STR-OFFSETS: 0x00000014: 0000001e "main.cpp"
+; BOLT-MAIN-STR-OFFSETS: 0x00000018: 00000027 "main.dwo"
+
+; BOLT-HELPER-STR: 0x00000000: "_Z9getReturnv"
+; BOLT-HELPER-STR: 0x0000000e: "getReturn"
+; BOLT-HELPER-STR: 0x00000018: "int"
+; BOLT-HELPER-STR: 0x0000001c: "clang version 22.0.0"
+; BOLT-HELPER-STR: 0x00000031: "helper.cpp"
+; BOLT-HELPER-STR: 0x0000003c: "helper.dwo"
+
+; BOLT-HELPER-STR-OFFSETS: 0x00000000: Contribution size = 28, Format = DWARF32, Version = 5
+; BOLT-HELPER-STR-OFFSETS: 0x00000008: 00000000 "_Z9getReturnv"
+; BOLT-HELPER-STR-OFFSETS: 0x0000000c: 0000000e "getReturn"
+; BOLT-HELPER-STR-OFFSETS: 0x00000010: 00000018 "int"
+; BOLT-HELPER-STR-OFFSETS: 0x00000014: 0000001c "clang version 22.0.0"
+; BOLT-HELPER-STR-OFFSETS: 0x00000018: 00000031 "helper.cpp"
+; BOLT-HELPER-STR-OFFSETS: 0x0000001c: 0000003c "helper.dwo"
diff --git a/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h b/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h
index 11086fb4bfda1..62bf42da4f9f9 100644
--- a/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h
+++ b/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h
@@ -25,7 +25,8 @@ class UncheckedOptionalAccessCheck : public ClangTidyCheck {
 public:
   UncheckedOptionalAccessCheck(StringRef Name, ClangTidyContext *Context)
       : ClangTidyCheck(Name, Context),
-        ModelOptions{Options.get("IgnoreSmartPointerDereference", false)} {}
+        ModelOptions{Options.get("IgnoreSmartPointerDereference", false),
+                     Options.get("IgnoreValueCalls", false)} {}
   void registerMatchers(ast_matchers::MatchFinder *Finder) override;
   void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
   bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
@@ -34,6 +35,7 @@ class UncheckedOptionalAccessCheck : public ClangTidyCheck {
   void storeOptions(ClangTidyOptions::OptionMap &Opts) override {
     Options.store(Opts, "IgnoreSmartPointerDereference",
                   ModelOptions.IgnoreSmartPointerDereference);
+    Options.store(Opts, "IgnoreValueCalls", ModelOptions.IgnoreValueCalls);
   }
 
 private:
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 2628a26acdf5e..4283fe0b5ea69 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -367,7 +367,11 @@ Changes in existing checks
 - Improved :doc:`bugprone-unchecked-optional-access
   <clang-tidy/checks/bugprone/unchecked-optional-access>` check by supporting
   ``NullableValue::makeValue`` and ``NullableValue::makeValueInplace`` to
-  prevent false-positives for ``BloombergLP::bdlb::NullableValue`` type.
+  prevent false positives for ``BloombergLP::bdlb::NullableValue`` type, and
+  by adding the `IgnoreValueCalls` option to suppress diagnostics for
+  ``optional::value()`` and documenting the `IgnoreSmartPointerDereference`
+  option (ignores optionals reached via smart-pointer-like dereference),
+  while still diagnosing UB-prone dereferences via ``operator*`` and ``operator->``.
 
 - Improved :doc:`bugprone-unhandled-self-assignment
   <clang-tidy/checks/bugprone/unhandled-self-assignment>` check by adding
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst
index 552e6db699696..ebed79e339d4b 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unchecked-optional-access.rst
@@ -308,3 +308,22 @@ advantages:
 * Performance. A single check can cover many or even all accesses within
   scope. This gives the user the best of both worlds -- the safety of a
   dynamic check, but without incurring redundant costs.
+
+Options
+-------
+
+.. option:: IgnoreSmartPointerDereference
+
+   If set to `true`, the check ignores optionals that are reached through
+   overloaded smart-pointer-like dereference (``operator*``, ``operator->``)
+   on classes other than the optional type itself. This helps avoid false
+   positives where the analysis cannot equate results across such calls. This
+   does not cover access through ``operator[]``. Default is `false`.
+
+.. option:: IgnoreValueCalls
+
+   If set to `true`, the check does not diagnose calls to
+   ``optional::value()``. Diagnostics for ``operator*()`` and ``operator->()``
+   remain enabled. This is useful for codebases that intentionally rely on
+   ``value()`` for defined, guarded access while still flagging UB-prone
+   operator dereferences. Default is `false`.
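As a quick illustration of the split the two options describe (a sketch, not part of the patch; std::optional stands in for the absl type used by the test below, and the throwing behavior of value() is the standard std::optional guarantee):

#include <optional>

int readBoth(const std::optional<int> &Opt) {
  // Not flagged when IgnoreValueCalls is true: value() checks for emptiness
  // itself and throws std::bad_optional_access instead of invoking UB.
  int A = Opt.value();
  // Still flagged: operator* on an empty optional is undefined behavior.
  return A + *Opt;
}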
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unchecked-optional-access-ignore-value.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unchecked-optional-access-ignore-value.cpp
new file mode 100644
index 0000000000000..f54621269f8c0
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unchecked-optional-access-ignore-value.cpp
@@ -0,0 +1,25 @@
+// RUN: %check_clang_tidy %s bugprone-unchecked-optional-access %t -- \
+// RUN:   -config="{CheckOptions: \
+// RUN:     {bugprone-unchecked-optional-access.IgnoreValueCalls: true}}" -- \
+// RUN:   -I %S/Inputs/unchecked-optional-access
+
+#include "absl/types/optional.h"
+
+struct Foo {
+  void foo() const {}
+};
+
+void unchecked_value_access(const absl::optional<Foo> &opt) {
+  opt.value(); // no-warning
+}
+
+void unchecked_deref_operator_access(const absl::optional<Foo> &opt) {
+  *opt;
+  // CHECK-MESSAGES: :[[@LINE-1]]:4: warning: unchecked access to optional value
+}
+
+void unchecked_arrow_operator_access(const absl::optional<Foo> &opt) {
+  opt->foo();
+  // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: unchecked access to optional value
+}
+
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 5f356daec2d04..c095a59bb978d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -844,6 +844,9 @@ RISC-V Support
 - Add `-march=unset` to clear any previous `-march=` value. This ISA string
   will be computed from `-mcpu` or the platform default.
 
+- `__GCC_CONSTRUCTIVE_SIZE` and `__GCC_DESTRUCTIVE_SIZE` have been changed to
+  64. These values are unstable according to `Clang's documentation
+  <https://clang.llvm.org/docs/LanguageExtensions.html>`_.
+
 CUDA/HIP Language Changes
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h b/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
index 696c9f4a6cf5c..c547d6ce2e387 100644
--- a/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
+++ b/clang/include/clang/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.h
@@ -46,6 +46,9 @@ struct UncheckedOptionalAccessModelOptions {
   /// are confident in this const accessor caching, we shouldn't need the
   /// IgnoreSmartPointerDereference option anymore.
   bool IgnoreSmartPointerDereference = false;
+
+  /// In generating diagnostics, ignore calls to `optional::value()`.
+  bool IgnoreValueCalls = false;
 };
 
 using UncheckedOptionalAccessLattice = CachedConstAccessorsLattice<NoopLattice>;
diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index f68422c6eb01d..1243380ca8a6b 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -5432,8 +5432,7 @@ bool Compiler<Emitter>::VisitCXXThisExpr(const CXXThisExpr *E) {
   unsigned EndIndex = 0;
   // Find the init list.
   for (StartIndex = InitStack.size() - 1; StartIndex > 0; --StartIndex) {
-    if (InitStack[StartIndex].Kind == InitLink::K_InitList ||
-        InitStack[StartIndex].Kind == InitLink::K_This) {
+    if (InitStack[StartIndex].Kind == InitLink::K_DIE) {
       EndIndex = StartIndex;
       --StartIndex;
       break;
@@ -5446,7 +5445,8 @@ bool Compiler<Emitter>::VisitCXXThisExpr(const CXXThisExpr *E) {
       continue;
 
     if (InitStack[StartIndex].Kind != InitLink::K_Field &&
-        InitStack[StartIndex].Kind != InitLink::K_Elem)
+        InitStack[StartIndex].Kind != InitLink::K_Elem &&
+        InitStack[StartIndex].Kind != InitLink::K_DIE)
       break;
   }
 
@@ -5457,7 +5457,8 @@ bool Compiler<Emitter>::VisitCXXThisExpr(const CXXThisExpr *E) {
   // Emit the instructions.
   for (unsigned I = StartIndex; I != (EndIndex + 1); ++I) {
-    if (InitStack[I].Kind == InitLink::K_InitList)
+    if (InitStack[I].Kind == InitLink::K_InitList ||
+        InitStack[I].Kind == InitLink::K_DIE)
       continue;
     if (!InitStack[I].template emit<Emitter>(this, E))
       return false;
@@ -6328,8 +6329,8 @@ bool Compiler<Emitter>::compileConstructor(const CXXConstructorDecl *Ctor) {
       unsigned FirstLinkOffset =
           R->getField(cast<FieldDecl>(IFD->chain()[0]))->Offset;
-      InitStackScope<Emitter> ISS(this, isa<CXXDefaultInitExpr>(InitExpr));
       InitLinkScope<Emitter> ILS(this, InitLink::Field(FirstLinkOffset));
+      InitStackScope<Emitter> ISS(this, isa<CXXDefaultInitExpr>(InitExpr));
       if (!emitFieldInitializer(NestedField, NestedFieldOffset, InitExpr,
                                 IsUnion))
         return false;
diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h
index 5c46f75af4da3..0c6cab9276531 100644
--- a/clang/lib/AST/ByteCode/Compiler.h
+++ b/clang/lib/AST/ByteCode/Compiler.h
@@ -52,12 +52,14 @@ struct InitLink {
     K_Decl = 3,
     K_Elem = 5,
     K_RVO = 6,
-    K_InitList = 7
+    K_InitList = 7,
+    K_DIE = 8,
   };
 
   static InitLink This() { return InitLink{K_This}; }
   static InitLink InitList() { return InitLink{K_InitList}; }
   static InitLink RVO() { return InitLink{K_RVO}; }
+  static InitLink DIE() { return InitLink{K_DIE}; }
   static InitLink Field(unsigned Offset) {
     InitLink IL{K_Field};
     IL.Offset = Offset;
@@ -668,22 +670,29 @@ template <class Emitter> class InitLinkScope final {
 
   ~InitLinkScope() { this->Ctx->InitStack.pop_back(); }
 
-private:
+public:
   Compiler<Emitter> *Ctx;
 };
 
 template <class Emitter> class InitStackScope final {
 public:
   InitStackScope(Compiler<Emitter> *Ctx, bool Active)
-      : Ctx(Ctx), OldValue(Ctx->InitStackActive) {
+      : Ctx(Ctx), OldValue(Ctx->InitStackActive), Active(Active) {
     Ctx->InitStackActive = Active;
+    if (Active)
+      Ctx->InitStack.push_back(InitLink::DIE());
   }
 
-  ~InitStackScope() { this->Ctx->InitStackActive = OldValue; }
+  ~InitStackScope() {
+    this->Ctx->InitStackActive = OldValue;
+    if (Active)
+      Ctx->InitStack.pop_back();
+  }
 
 private:
   Compiler<Emitter> *Ctx;
   bool OldValue;
+  bool Active;
 };
 
 } // namespace interp
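The InitStackScope change above couples the existing InitStackActive toggle with a push/pop of a K_DIE marker, both guarded by the same Active flag captured at construction. A generic sketch of that RAII shape (illustrative only, not the Clang type):

#include <vector>

// Push a marker on construction when active; pop it on destruction. The flag
// is stored once so construction and destruction always agree, even if the
// surrounding context's state changes in between.
template <typename T> class ScopedMarker {
public:
  ScopedMarker(std::vector<T> &Stack, bool Active, T Marker)
      : Stack(Stack), Active(Active) {
    if (Active)
      Stack.push_back(Marker);
  }
  ~ScopedMarker() {
    if (Active)
      Stack.pop_back();
  }

private:
  std::vector<T> &Stack;
  bool Active;
};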
diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
index 0fa333eedcfdd..d90f5d4eaf7bb 100644
--- a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp
@@ -1153,26 +1153,34 @@ auto buildDiagnoseMatchSwitch(
   // FIXME: Evaluate the efficiency of matchers. If using matchers results in a
   // lot of duplicated work (e.g. string comparisons), consider providing APIs
   // that avoid it through memoization.
-  auto IgnorableOptional = ignorableOptional(Options);
-  return CFGMatchSwitchBuilder<
-             const Environment,
-             llvm::SmallVector<SourceLocation>>()
-      // optional::value
-      .CaseOfCFGStmt<CXXMemberCallExpr>(
-          valueCall(IgnorableOptional),
-          [](const CXXMemberCallExpr *E, const MatchFinder::MatchResult &,
-             const Environment &Env) {
-            return diagnoseUnwrapCall(E->getImplicitObjectArgument(), Env);
-          })
-
-      // optional::operator*, optional::operator->
-      .CaseOfCFGStmt<CallExpr>(valueOperatorCall(IgnorableOptional),
-                               [](const CallExpr *E,
-                                  const MatchFinder::MatchResult &,
-                                  const Environment &Env) {
-                                 return diagnoseUnwrapCall(E->getArg(0), Env);
-                               })
-      .Build();
+  const auto IgnorableOptional = ignorableOptional(Options);
+
+  auto DiagBuilder =
+      CFGMatchSwitchBuilder<
+          const Environment,
+          llvm::SmallVector<SourceLocation>>()
+          // optional::operator*, optional::operator->
+          .CaseOfCFGStmt<CallExpr>(
+              valueOperatorCall(IgnorableOptional),
+              [](const CallExpr *E, const MatchFinder::MatchResult &,
+                 const Environment &Env) {
+                return diagnoseUnwrapCall(E->getArg(0), Env);
+              });
+
+  auto Builder = Options.IgnoreValueCalls
+                     ? std::move(DiagBuilder)
+                     : std::move(DiagBuilder)
+                           // optional::value
+                           .CaseOfCFGStmt<CXXMemberCallExpr>(
+                               valueCall(IgnorableOptional),
+                               [](const CXXMemberCallExpr *E,
+                                  const MatchFinder::MatchResult &,
+                                  const Environment &Env) {
+                                 return diagnoseUnwrapCall(
+                                     E->getImplicitObjectArgument(), Env);
+                               });
+
+  return std::move(Builder).Build();
 }
 
 } // namespace
diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp
index 2dec26ecacf26..5e9da245e2b43 100644
--- a/clang/lib/Basic/Diagnostic.cpp
+++ b/clang/lib/Basic/Diagnostic.cpp
@@ -534,7 +534,7 @@ WarningsSpecialCaseList::create(const llvm::MemoryBuffer &Input,
 void WarningsSpecialCaseList::processSections(DiagnosticsEngine &Diags) {
   static constexpr auto WarningFlavor = clang::diag::Flavor::WarningOrError;
   for (const auto &SectionEntry : sections()) {
-    StringRef DiagGroup = SectionEntry.SectionStr;
+    StringRef DiagGroup = SectionEntry.name();
     if (DiagGroup == "*") {
       // Drop the default section introduced by special case list, we only
      // support exact diagnostic group names.
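This hunk, together with the two below in ProfileList.cpp and SanitizerSpecialCaseList.cpp, replaces direct access to SpecialCaseList section internals (SectionStr, Entries, SectionMatcher, FileIdx) with accessors (name(), hasPrefix(), matchName(), fileIndex()). A minimal sketch of a consumer written against the accessor API; the subclass is hypothetical, and only calls that appear in this diff are used:

#include "llvm/Support/SpecialCaseList.h"
#include "llvm/Support/raw_ostream.h"

// Hypothetical subclass: lists each section by name instead of reaching into
// the Section members directly.
class SectionLister : public llvm::SpecialCaseList {
public:
  void listSections() const {
    for (const auto &S : sections())
      llvm::errs() << S.name() << "\n";
  }
};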
diff --git a/clang/lib/Basic/ProfileList.cpp b/clang/lib/Basic/ProfileList.cpp
index 9cb118893a0d9..8727057eb78d1 100644
--- a/clang/lib/Basic/ProfileList.cpp
+++ b/clang/lib/Basic/ProfileList.cpp
@@ -36,7 +36,7 @@ class ProfileSpecialCaseList : public llvm::SpecialCaseList {
 
   bool hasPrefix(StringRef Prefix) const {
     for (const auto &It : sections())
-      if (It.Entries.count(Prefix) > 0)
+      if (It.hasPrefix(Prefix))
        return true;
     return false;
   }
diff --git a/clang/lib/Basic/SanitizerSpecialCaseList.cpp b/clang/lib/Basic/SanitizerSpecialCaseList.cpp
index 56f551628cf89..928c086898097 100644
--- a/clang/lib/Basic/SanitizerSpecialCaseList.cpp
+++ b/clang/lib/Basic/SanitizerSpecialCaseList.cpp
@@ -42,7 +42,7 @@ void SanitizerSpecialCaseList::createSanitizerSections() {
     SanitizerMask Mask;
 
 #define SANITIZER(NAME, ID)                                                    \
-  if (S.SectionMatcher.matchAny(NAME))                                         \
+  if (S.matchName(NAME))                                                       \
     Mask |= SanitizerKind::ID;
 #define SANITIZER_GROUP(NAME, ID, ALIAS) SANITIZER(NAME, ID)
 
@@ -68,7 +68,7 @@ SanitizerSpecialCaseList::inSectionBlame(SanitizerMask Mask, StringRef Prefix,
     if (S.Mask & Mask) {
       unsigned LineNum = S.S.getLastMatch(Prefix, Query, Category);
       if (LineNum > 0)
-        return {S.S.FileIdx, LineNum};
+        return {S.S.fileIndex(), LineNum};
     }
   }
   return NotFound;
diff --git a/clang/lib/Basic/Targets/RISCV.h b/clang/lib/Basic/Targets/RISCV.h
index 85fa4cc07dccf..21555b94fe65d 100644
--- a/clang/lib/Basic/Targets/RISCV.h
+++ b/clang/lib/Basic/Targets/RISCV.h
@@ -126,7 +126,7 @@ class RISCVTargetInfo : public TargetInfo {
   llvm::APInt getFMVPriority(ArrayRef<StringRef> Features) const override;
 
   std::pair<unsigned, unsigned> hardwareInterferenceSizes() const override {
-    return std::make_pair(32, 32);
+    return std::make_pair(64, 64);
   }
 
   bool supportsCpuSupports() const override { return getTriple().isOSLinux(); }
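These are the values behind the `__GCC_DESTRUCTIVE_SIZE`/`__GCC_CONSTRUCTIVE_SIZE` change noted in the release notes above; standard libraries commonly derive the C++17 interference-size constants from those macros (libstdc++ does, for instance). A short sketch of the usual consumer, padding hot atomics apart so they do not share a cache line; this assumes a standard library that provides the constants:

#include <atomic>
#include <new>

// Each counter gets its own "destructive interference" block (64 bytes on
// RISC-V after this change), so writers on different cores do not false-share
// a cache line.
struct Counters {
  alignas(std::hardware_destructive_interference_size) std::atomic<int> Reads{0};
  alignas(std::hardware_destructive_interference_size) std::atomic<int> Writes{0};
};

static_assert(sizeof(Counters) >=
              2 * std::hardware_destructive_interference_size);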
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 68a83561d7cb1..b088251e0cf3c 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -10083,19 +10083,44 @@ static llvm::Value *emitDeviceID(
   return DeviceID;
 }
 
-static llvm::Value *emitDynCGGroupMem(const OMPExecutableDirective &D,
-                                      CodeGenFunction &CGF) {
-  llvm::Value *DynCGroupMem = CGF.Builder.getInt32(0);
-
-  if (auto *DynMemClause = D.getSingleClause<OMPXDynCGroupMemClause>()) {
-    CodeGenFunction::RunCleanupsScope DynCGroupMemScope(CGF);
-    llvm::Value *DynCGroupMemVal = CGF.EmitScalarExpr(
-        DynMemClause->getSize(), /*IgnoreResultAssign=*/true);
-    DynCGroupMem = CGF.Builder.CreateIntCast(DynCGroupMemVal, CGF.Int32Ty,
-                                             /*isSigned=*/false);
+static std::pair<llvm::Value *, OMPDynGroupprivateFallbackType>
+emitDynCGroupMem(const OMPExecutableDirective &D, CodeGenFunction &CGF) {
+  llvm::Value *DynGP = CGF.Builder.getInt32(0);
+  auto DynGPFallback = OMPDynGroupprivateFallbackType::Abort;
+
+  if (auto *DynGPClause = D.getSingleClause<OMPDynGroupprivateClause>()) {
+    CodeGenFunction::RunCleanupsScope DynGPScope(CGF);
+    llvm::Value *DynGPVal =
+        CGF.EmitScalarExpr(DynGPClause->getSize(), /*IgnoreResultAssign=*/true);
+    DynGP = CGF.Builder.CreateIntCast(DynGPVal, CGF.Int32Ty,
+                                      /*isSigned=*/false);
+    auto FallbackModifier = DynGPClause->getDynGroupprivateFallbackModifier();
+    switch (FallbackModifier) {
+    case OMPC_DYN_GROUPPRIVATE_FALLBACK_abort:
+      DynGPFallback = OMPDynGroupprivateFallbackType::Abort;
+      break;
+    case OMPC_DYN_GROUPPRIVATE_FALLBACK_null:
+      DynGPFallback = OMPDynGroupprivateFallbackType::Null;
+      break;
+    case OMPC_DYN_GROUPPRIVATE_FALLBACK_default_mem:
+    case OMPC_DYN_GROUPPRIVATE_FALLBACK_unknown:
+      // This is the default for dyn_groupprivate.
+      DynGPFallback = OMPDynGroupprivateFallbackType::DefaultMem;
+      break;
+    default:
+      llvm_unreachable("Unknown fallback modifier for OpenMP dyn_groupprivate");
+    }
+  } else if (auto *OMPXDynCGClause =
+                 D.getSingleClause<OMPXDynCGroupMemClause>()) {
+    CodeGenFunction::RunCleanupsScope DynCGMemScope(CGF);
+    llvm::Value *DynCGMemVal = CGF.EmitScalarExpr(OMPXDynCGClause->getSize(),
+                                                  /*IgnoreResultAssign=*/true);
+    DynGP = CGF.Builder.CreateIntCast(DynCGMemVal, CGF.Int32Ty,
+                                      /*isSigned=*/false);
   }
 
-  return DynCGroupMem;
+  return {DynGP, DynGPFallback};
 }
+
 static void genMapInfoForCaptures(
     MappableExprsHandler &MEHandler, CodeGenFunction &CGF,
     const CapturedStmt &CS, llvm::SmallVectorImpl<llvm::Value *> &CapturedVars,
@@ -10640,7 +10665,7 @@ static void emitTargetCallKernelLaunch(
   llvm::Value *RTLoc = OMPRuntime->emitUpdateLocation(CGF, D.getBeginLoc());
   llvm::Value *NumIterations =
       OMPRuntime->emitTargetNumIterationsCall(CGF, D, SizeEmitter);
-  llvm::Value *DynCGGroupMem = emitDynCGGroupMem(D, CGF);
+  auto [DynCGroupMem, DynCGroupMemFallback] = emitDynCGroupMem(D, CGF);
   llvm::OpenMPIRBuilder::InsertPointTy AllocaIP(
       CGF.AllocaInsertPt->getParent(), CGF.AllocaInsertPt->getIterator());
 
@@ -10650,7 +10675,7 @@ static void emitTargetCallKernelLaunch(
 
   llvm::OpenMPIRBuilder::TargetKernelArgs Args(
       NumTargetItems, RTArgs, NumIterations, NumTeams, NumThreads,
-      DynCGGroupMem, HasNoWait);
+      DynCGroupMem, HasNoWait, DynCGroupMemFallback);
 
   llvm::OpenMPIRBuilder::InsertPointTy AfterIP =
       cantFail(OMPRuntime->getOMPBuilder().emitKernelLaunch(
diff --git a/clang/lib/Headers/__clang_cuda_device_functions.h b/clang/lib/Headers/__clang_cuda_device_functions.h
index 86123727a1bc3..0226fe95abab6 100644
--- a/clang/lib/Headers/__clang_cuda_device_functions.h
+++ b/clang/lib/Headers/__clang_cuda_device_functions.h
@@ -528,7 +528,7 @@ __DEVICE__ float __tanf(float __a) { return __nv_fast_tanf(__a); }
 __DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); }
 __DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); };
 __DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); };
-__DEVICE__ void __trap(void) { __asm__ __volatile__("trap;"); }
+__DEVICE__ __attribute__((noreturn)) void __trap(void) { __builtin_trap(); }
 __DEVICE__ unsigned short __usAtomicCAS(unsigned short *__p,
                                         unsigned short __cmp,
                                         unsigned short __v) {
   return __nvvm_atom_cas_gen_us(__p, __cmp, __v);
diff --git a/clang/test/AST/ByteCode/c.c b/clang/test/AST/ByteCode/c.c
index 3360d4f725b24..bffd557ff77a6 100644
--- a/clang/test/AST/ByteCode/c.c
+++ b/clang/test/AST/ByteCode/c.c
@@ -387,3 +387,8 @@ void bar2(void) {
   int a[2][3][4][5]; // all-note {{array 'a' declared here}}
   foo2(&a[0][4]); // all-warning {{array index 4 is past the end of the array}}
 }
+
+void plainComplex(void) {
+  _Complex cd; // all-warning {{_Complex double}}
+  cd = *(_Complex *)&(struct { double r, i; }){0.0, 0.0}; // all-warning {{_Complex double}}
+}
diff --git a/clang/test/AST/ByteCode/cxx14.cpp b/clang/test/AST/ByteCode/cxx14.cpp
index 9622311e100cb..57cb42ea4a98b 100644
--- a/clang/test/AST/ByteCode/cxx14.cpp
+++ b/clang/test/AST/ByteCode/cxx14.cpp
@@ -7,3 +7,24 @@
 constexpr int(*null_ptr)() = nullptr;
 constexpr int test4 = (*null_ptr)(); // both-error {{must be initialized by a constant expression}} \
                                      // both-note {{evaluates to a null function pointer}}
+
+struct E {
+  int n = 0;
+  struct {
+    void *x = this;
+  };
+  void *y = this;
+};
+constexpr E e1 = E();
+static_assert(e1.x != e1.y, "");
+constexpr E e2 = E{0};
+static_assert(e2.x != e2.y, "");
+
+struct S {
+  int &&a = 2;
+  int b[1]{a};
+}; +constexpr int foo() { + S s{12}; + return s.b[0]; +} +static_assert(foo() == 12, ""); diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp index 3fb78dc871904..4eab3999dfc42 100644 --- a/clang/test/CIR/CodeGen/complex.cpp +++ b/clang/test/CIR/CodeGen/complex.cpp @@ -1495,3 +1495,42 @@ void calling_function_that_return_complex() { // OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 // OGCG: store float %[[RESULT_REAL]], ptr %[[A_REAL_PTR]], align 4 // OGCG: store float %[[RESULT_IMAG]], ptr %[[A_IMAG_PTR]], align 4 + +void imag_literal_gnu_extension() { + float _Complex a = 3.0fi; + double _Complex b = 3.0i; + int _Complex c = 3i; +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a", init] +// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.double>, !cir.ptr<!cir.complex<!cir.double>>, ["b", init] +// CIR: %[[C_ADDR:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["c", init] +// CIR: %[[COMPLEX_A:.*]] = cir.const #cir.const_complex<#cir.fp<0.000000e+00> : !cir.float, #cir.fp<3.000000e+00> : !cir.float> : !cir.complex<!cir.float> +// CIR: cir.store{{.*}} %[[COMPLEX_A]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>> +// CIR: %[[COMPLEX_B:.*]] = cir.const #cir.const_complex<#cir.fp<0.000000e+00> : !cir.double, #cir.fp<3.000000e+00> : !cir.double> : !cir.complex<!cir.double> +// CIR: cir.store{{.*}} %[[COMPLEX_B]], %[[B_ADDR]] : !cir.complex<!cir.double>, !cir.ptr<!cir.complex<!cir.double>> +// CIR: %[[COMPLEX_C:.*]] = cir.const #cir.const_complex<#cir.int<0> : !s32i, #cir.int<3> : !s32i> : !cir.complex<!s32i> +// CIR: cir.store{{.*}} %[[COMPLEX_C]], %[[C_ADDR]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { double, double }, i64 1, align 8 +// LLVM: %[[C_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: store { float, float } { float 0.000000e+00, float 3.000000e+00 }, ptr %[[A_ADDR]], align 4 +// LLVM: store { double, double } { double 0.000000e+00, double 3.000000e+00 }, ptr %[[B_ADDR]], align 8 +// LLVM: store { i32, i32 } { i32 0, i32 3 }, ptr %[[C_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { double, double }, align 8 +// OGCG: %[[C_ADDR:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: store float 0.000000e+00, ptr %[[A_REAL_PTR]], align 4 +// OGCG: store float 3.000000e+00, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[B_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[B_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store double 0.000000e+00, ptr %[[B_REAL_PTR]], align 8 +// OGCG: store double 3.000000e+00, ptr %[[B_IMAG_PTR]], align 8 +// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 0 +// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[C_ADDR]], i32 0, i32 1 +// OGCG: store i32 0, ptr %[[C_REAL_PTR]], align 4 +// OGCG: store i32 3, ptr %[[C_IMAG_PTR]], align 4 diff --git a/clang/test/CodeGen/Inputs/basic-block-sections.funcnames b/clang/test/CodeGen/Inputs/basic-block-sections.funcnames index 329cea9a0adfb..2452ee345fe2f 100644 --- a/clang/test/CodeGen/Inputs/basic-block-sections.funcnames +++ b/clang/test/CodeGen/Inputs/basic-block-sections.funcnames
@@ -1 +1,3 @@ -!world +v1 +f world +c 0 diff --git a/clang/test/CodeGen/basic-block-sections.c b/clang/test/CodeGen/basic-block-sections.c index a61b8dd4ac376..0c21a4cb1442c 100644 --- a/clang/test/CodeGen/basic-block-sections.c +++ b/clang/test/CodeGen/basic-block-sections.c @@ -30,8 +30,10 @@ int another(int a) { // // BB_WORLD: .section .text.world,"ax",@progbits{{$}} // BB_WORLD: world: -// BB_WORLD: .section .text.world,"ax",@progbits,unique -// BB_WORLD: world.__part.1: +// BB_ALL: .section .text.world,"ax",@progbits,unique +// BB_ALL: world.__part.1: +// BB_LIST: .section .text.split.world,"ax",@progbits +// BB_LIST: world.cold: // BB_ALL: .section .text.another,"ax",@progbits // BB_ALL: another.__part.1: // BB_LIST-NOT: .section .text.another,"ax",@progbits diff --git a/clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp b/clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp new file mode 100644 index 0000000000000..758f35d629ace --- /dev/null +++ b/clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp @@ -0,0 +1,2633 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ +// Test host codegen. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK1 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK3 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK3 + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple 
i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" + +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK9 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK9 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK11 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK11 + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s 
--implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=61 -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" + + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + + + + +// We have 6 target regions + + + +// Check that target registration is emitted as a Ctor. + + +template <typename tx> +tx ftemplate(int n) { + tx a = 0; + + #pragma omp target teams dyn_groupprivate(tx(20)) + { + } + + short b = 1; + #pragma omp target teams num_teams(b) dyn_groupprivate(1024) + { + a += b; + } + + return a; +} + +static +int fstatic(int n) { + + #pragma omp target teams distribute parallel for simd num_teams(n) dyn_groupprivate(n*32) + for (int i = 0; i < n ; ++i) { + } + + #pragma omp target teams dyn_groupprivate(fallback(default_mem): 32+n) nowait + { + } + + return n+1; +} + +struct S1 { + double a; + + int r1(int n){ + int b = 1; + + #pragma omp target teams dyn_groupprivate(fallback(null): n-b) + { + this->a = (double)b + 1.5; + } + + #pragma omp target dyn_groupprivate(fallback(abort): 1024) + { + this->a = 2.5; + } + + return (int)a; + } +}; + +int bar(int n){ + int a = 0; + + S1 S; + a += S.r1(n); + + a += fstatic(n); + + a += ftemplate<int>(n); + + return a; +} + + + + + + + + + + + + + + + + + + + + + +// Check that the offloading functions are emitted and that the parallel function +// is appropriately guarded. + + + + + + +#endif + +// CHECK1-LABEL: define {{[^@]+}}@_Z3bari +// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 8 +// CHECK1-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[A]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: [[CALL:%.*]] = call noundef signext i32 @_ZN2S12r1Ei(ptr noundef nonnull align 8 dereferenceable(8) [[S]], i32 noundef signext [[TMP0]]) +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CALL]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[A]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: [[CALL1:%.*]] = call noundef signext i32 @_ZL7fstatici(i32 noundef signext [[TMP2]]) +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4 +// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP3]], [[CALL1]] +// CHECK1-NEXT: store i32 [[ADD2]], ptr [[A]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: [[CALL3:%.*]] = call noundef signext i32 @_Z9ftemplateIiET_i(i32 noundef signext [[TMP4]]) +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4 +// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP5]], [[CALL3]] +// CHECK1-NEXT: store i32 [[ADD4]], ptr [[A]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[A]], align 4 +// CHECK1-NEXT: ret i32 [[TMP6]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@_ZN2S12r1Ei +// CHECK1-SAME: (ptr noundef nonnull align 8 dereferenceable(8) [[THIS:%.*]], i32 noundef
signext [[N:%.*]]) #[[ATTR0]] comdat { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[B:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8 +// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS3:%.*]] = alloca [1 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_PTRS4:%.*]] = alloca [1 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS5:%.*]] = alloca [1 x ptr], align 8 +// CHECK1-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store i32 1, ptr [[B]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]] +// CHECK1-NEXT: store i32 [[SUB]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[B]], align 4 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[B_CASTED]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, ptr [[B_CASTED]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: store ptr [[THIS1]], ptr [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: store ptr [[A]], ptr [[TMP7]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK1-NEXT: store ptr null, ptr [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK1-NEXT: store i64 [[TMP3]], ptr [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK1-NEXT: store i64 [[TMP3]], ptr [[TMP10]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK1-NEXT: store ptr null, ptr [[TMP11]], align 8 +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK1-NEXT: store i64 [[TMP5]], ptr [[TMP12]], align 8 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK1-NEXT: store i64 [[TMP5]], ptr [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK1-NEXT: store ptr null, ptr [[TMP14]], align 8 +// 
CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK1-NEXT: store i32 3, ptr [[TMP18]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK1-NEXT: store i32 3, ptr [[TMP19]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK1-NEXT: store ptr [[TMP15]], ptr [[TMP20]], align 8 +// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK1-NEXT: store ptr [[TMP16]], ptr [[TMP21]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK1-NEXT: store ptr @.offload_sizes, ptr [[TMP22]], align 8 +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK1-NEXT: store ptr @.offload_maptypes, ptr [[TMP23]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK1-NEXT: store ptr null, ptr [[TMP24]], align 8 +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK1-NEXT: store ptr null, ptr [[TMP25]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK1-NEXT: store i64 0, ptr [[TMP26]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK1-NEXT: store i64 4, ptr [[TMP27]], align 8 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4 +// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP29]], align 4 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK1-NEXT: store i32 [[TMP17]], ptr [[TMP30]], align 4 +// CHECK1-NEXT: [[TMP31:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.region_id, ptr [[KERNEL_ARGS]]) +// CHECK1-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 +// CHECK1-NEXT: br i1 [[TMP32]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK1: omp_offload.failed: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88(ptr [[THIS1]], i64 [[TMP3]], i64 [[TMP5]]) #[[ATTR2:[0-9]+]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK1: omp_offload.cont: +// CHECK1-NEXT: [[A2:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr 
[[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0 +// CHECK1-NEXT: store ptr [[THIS1]], ptr [[TMP33]], align 8 +// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0 +// CHECK1-NEXT: store ptr [[A2]], ptr [[TMP34]], align 8 +// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS5]], i64 0, i64 0 +// CHECK1-NEXT: store ptr null, ptr [[TMP35]], align 8 +// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0 +// CHECK1-NEXT: store i32 3, ptr [[TMP38]], align 4 +// CHECK1-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1 +// CHECK1-NEXT: store i32 1, ptr [[TMP39]], align 4 +// CHECK1-NEXT: [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2 +// CHECK1-NEXT: store ptr [[TMP36]], ptr [[TMP40]], align 8 +// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3 +// CHECK1-NEXT: store ptr [[TMP37]], ptr [[TMP41]], align 8 +// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4 +// CHECK1-NEXT: store ptr @.offload_sizes.1, ptr [[TMP42]], align 8 +// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5 +// CHECK1-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP43]], align 8 +// CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6 +// CHECK1-NEXT: store ptr null, ptr [[TMP44]], align 8 +// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7 +// CHECK1-NEXT: store ptr null, ptr [[TMP45]], align 8 +// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8 +// CHECK1-NEXT: store i64 0, ptr [[TMP46]], align 8 +// CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9 +// CHECK1-NEXT: store i64 0, ptr [[TMP47]], align 8 +// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10 +// CHECK1-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP48]], align 4 +// CHECK1-NEXT: [[TMP49:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP49]], align 4 +// CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12 +// CHECK1-NEXT: store i32 1024, ptr [[TMP50]], align 4 +// CHECK1-NEXT: [[TMP51:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93.region_id, ptr [[KERNEL_ARGS6]]) +// CHECK1-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0 +// CHECK1-NEXT: br i1 [[TMP52]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]] +// CHECK1: omp_offload.failed7: +// CHECK1-NEXT: call void 
@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93(ptr [[THIS1]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT8]] +// CHECK1: omp_offload.cont8: +// CHECK1-NEXT: [[A9:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP53:%.*]] = load double, ptr [[A9]], align 8 +// CHECK1-NEXT: [[CONV:%.*]] = fptosi double [[TMP53]] to i32 +// CHECK1-NEXT: ret i32 [[CONV]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@_ZL7fstatici +// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED2:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED8:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS9:%.*]] = alloca [1 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_PTRS10:%.*]] = alloca [1 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS11:%.*]] = alloca [1 x ptr], align 8 +// CHECK1-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK1-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], 32 +// CHECK1-NEXT: store i32 [[MUL]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[N_CASTED]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 [[TMP7]], ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED2]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: store i64 [[TMP4]], ptr [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: store i64 [[TMP4]], ptr [[TMP10]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK1-NEXT: store ptr null, ptr [[TMP11]], align 8 +// CHECK1-NEXT: [[TMP12:%.*]] = 
getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK1-NEXT: store i64 [[TMP6]], ptr [[TMP12]], align 8 +// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK1-NEXT: store i64 [[TMP6]], ptr [[TMP13]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK1-NEXT: store ptr null, ptr [[TMP14]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK1-NEXT: store i64 [[TMP8]], ptr [[TMP15]], align 8 +// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK1-NEXT: store i64 [[TMP8]], ptr [[TMP16]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK1-NEXT: store ptr null, ptr [[TMP17]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP21]], ptr [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP22]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB5:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1 +// CHECK1-NEXT: [[TMP24:%.*]] = zext i32 [[ADD]] to i64 +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP20]], 0 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK1-NEXT: store i32 3, ptr [[TMP27]], align 4 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK1-NEXT: store i32 3, ptr [[TMP28]], align 4 +// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK1-NEXT: store ptr [[TMP18]], ptr [[TMP29]], align 8 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK1-NEXT: store ptr [[TMP19]], ptr [[TMP30]], align 8 +// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK1-NEXT: store ptr @.offload_sizes.3, ptr [[TMP31]], align 8 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK1-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP32]], align 8 +// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK1-NEXT: store ptr null, ptr [[TMP33]], align 8 +// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK1-NEXT: store ptr null, 
ptr [[TMP34]], align 8 +// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK1-NEXT: store i64 [[TMP24]], ptr [[TMP35]], align 8 +// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK1-NEXT: store i64 8, ptr [[TMP36]], align 8 +// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK1-NEXT: store [3 x i32] [[TMP26]], ptr [[TMP37]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4 +// CHECK1-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK1-NEXT: store i32 [[TMP25]], ptr [[TMP39]], align 4 +// CHECK1-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP20]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.region_id, ptr [[KERNEL_ARGS]]) +// CHECK1-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0 +// CHECK1-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK1: omp_offload.failed: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71(i64 [[TMP4]], i64 [[TMP6]], i64 [[TMP8]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK1: omp_offload.cont: +// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 32, [[TMP42]] +// CHECK1-NEXT: store i32 [[ADD7]], ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: store i32 [[TMP43]], ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4 +// CHECK1-NEXT: [[TMP44:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED8]], align 8 +// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0 +// CHECK1-NEXT: store i64 [[TMP44]], ptr [[TMP45]], align 8 +// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0 +// CHECK1-NEXT: store i64 [[TMP44]], ptr [[TMP46]], align 8 +// CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS11]], i64 0, i64 0 +// CHECK1-NEXT: store ptr null, ptr [[TMP47]], align 8 +// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: store i32 [[TMP51]], ptr [[TMP50]], align 4 +// CHECK1-NEXT: [[TMP52:%.*]] = call ptr @__kmpc_omp_target_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 64, i64 4, ptr @.omp_task_entry., i64 -1) +// CHECK1-NEXT: [[TMP53:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP52]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP54:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP53]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP55:%.*]] = load ptr, ptr [[TMP54]], align 8 +// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP55]], ptr 
align 4 [[AGG_CAPTURED]], i64 4, i1 false) +// CHECK1-NEXT: [[TMP56:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP52]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP57:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP56]], i32 0, i32 0 +// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP57]], ptr align 8 [[TMP48]], i64 8, i1 false) +// CHECK1-NEXT: [[TMP58:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 1 +// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP58]], ptr align 8 [[TMP49]], i64 8, i1 false) +// CHECK1-NEXT: [[TMP59:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 2 +// CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP59]], ptr align 8 @.offload_sizes.5, i64 8, i1 false) +// CHECK1-NEXT: [[TMP60:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP52]]) +// CHECK1-NEXT: [[TMP61:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP61]], 1 +// CHECK1-NEXT: ret i32 [[ADD12]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i +// CHECK1-SAME: (i32 noundef signext [[N:%.*]]) #[[ATTR0]] comdat { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[A:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK1-NEXT: [[B:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 8 +// CHECK1-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 8 +// CHECK1-NEXT: [[KERNEL_ARGS1:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8 +// CHECK1-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[A]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK1-NEXT: store i32 3, ptr [[TMP0]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK1-NEXT: store i32 0, ptr [[TMP1]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK1-NEXT: store ptr null, ptr [[TMP2]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK1-NEXT: store ptr null, ptr [[TMP3]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK1-NEXT: store ptr null, ptr [[TMP4]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK1-NEXT: store ptr null, ptr [[TMP5]], align 8 +// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK1-NEXT: store ptr null, ptr [[TMP6]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw 
[[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK1-NEXT: store ptr null, ptr [[TMP7]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK1-NEXT: store i64 0, ptr [[TMP8]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK1-NEXT: store i64 8, ptr [[TMP9]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK1-NEXT: store i32 20, ptr [[TMP12]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.region_id, ptr [[KERNEL_ARGS]]) +// CHECK1-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0 +// CHECK1-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK1: omp_offload.failed: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55() #[[ATTR2]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK1: omp_offload.cont: +// CHECK1-NEXT: store i16 1, ptr [[B]], align 2 +// CHECK1-NEXT: [[TMP15:%.*]] = load i16, ptr [[B]], align 2 +// CHECK1-NEXT: store i16 [[TMP15]], ptr [[DOTCAPTURE_EXPR_]], align 2 +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[A]], align 4 +// CHECK1-NEXT: store i32 [[TMP16]], ptr [[A_CASTED]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[A_CASTED]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = load i16, ptr [[B]], align 2 +// CHECK1-NEXT: store i16 [[TMP18]], ptr [[B_CASTED]], align 2 +// CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[B_CASTED]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2 +// CHECK1-NEXT: store i16 [[TMP20]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 2 +// CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: store i64 [[TMP17]], ptr [[TMP22]], align 8 +// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: store i64 [[TMP17]], ptr [[TMP23]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK1-NEXT: store ptr null, ptr [[TMP24]], align 8 +// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1 +// CHECK1-NEXT: store i64 [[TMP19]], ptr [[TMP25]], align 8 +// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1 +// CHECK1-NEXT: store i64 [[TMP19]], ptr [[TMP26]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 1 +// CHECK1-NEXT: store ptr null, ptr [[TMP27]], align 8 +// CHECK1-NEXT: [[TMP28:%.*]] = getelementptr inbounds [3 x ptr], ptr 
[[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2 +// CHECK1-NEXT: store i64 [[TMP21]], ptr [[TMP28]], align 8 +// CHECK1-NEXT: [[TMP29:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2 +// CHECK1-NEXT: store i64 [[TMP21]], ptr [[TMP29]], align 8 +// CHECK1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 2 +// CHECK1-NEXT: store ptr null, ptr [[TMP30]], align 8 +// CHECK1-NEXT: [[TMP31:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP32:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP33:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2 +// CHECK1-NEXT: [[TMP34:%.*]] = sext i16 [[TMP33]] to i32 +// CHECK1-NEXT: [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0 +// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0 +// CHECK1-NEXT: store i32 3, ptr [[TMP36]], align 4 +// CHECK1-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1 +// CHECK1-NEXT: store i32 3, ptr [[TMP37]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2 +// CHECK1-NEXT: store ptr [[TMP31]], ptr [[TMP38]], align 8 +// CHECK1-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 3 +// CHECK1-NEXT: store ptr [[TMP32]], ptr [[TMP39]], align 8 +// CHECK1-NEXT: [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 4 +// CHECK1-NEXT: store ptr @.offload_sizes.7, ptr [[TMP40]], align 8 +// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 5 +// CHECK1-NEXT: store ptr @.offload_maptypes.8, ptr [[TMP41]], align 8 +// CHECK1-NEXT: [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 6 +// CHECK1-NEXT: store ptr null, ptr [[TMP42]], align 8 +// CHECK1-NEXT: [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 7 +// CHECK1-NEXT: store ptr null, ptr [[TMP43]], align 8 +// CHECK1-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 8 +// CHECK1-NEXT: store i64 0, ptr [[TMP44]], align 8 +// CHECK1-NEXT: [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 9 +// CHECK1-NEXT: store i64 8, ptr [[TMP45]], align 8 +// CHECK1-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 10 +// CHECK1-NEXT: store [3 x i32] [[TMP35]], ptr [[TMP46]], align 4 +// CHECK1-NEXT: [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 11 +// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4 +// CHECK1-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 12 +// CHECK1-NEXT: store i32 1024, ptr [[TMP48]], align 4 +// CHECK1-NEXT: [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP34]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.region_id, ptr 
[[KERNEL_ARGS1]]) +// CHECK1-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0 +// CHECK1-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED2:%.*]], label [[OMP_OFFLOAD_CONT3:%.*]] +// CHECK1: omp_offload.failed2: +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60(i64 [[TMP17]], i64 [[TMP19]], i64 [[TMP21]]) #[[ATTR2]] +// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT3]] +// CHECK1: omp_offload.cont3: +// CHECK1-NEXT: [[TMP51:%.*]] = load i32, ptr [[A]], align 4 +// CHECK1-NEXT: ret i32 [[TMP51]] +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88 +// CHECK1-SAME: (ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[B_CASTED]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[B_CASTED]], align 8 +// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i64 [[TMP2]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double +// CHECK1-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK1-NEXT: store double [[ADD]], ptr [[A]], align 8 +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93 +// CHECK1-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK1-NEXT: store double 2.500000e+00, ptr [[A]], align 8 +// CHECK1-NEXT: 
ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71 +// CHECK1-SAME: (i64 noundef [[N:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 8 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK1-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP1]], i32 0) +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[N_CASTED]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i64, ptr [[N_CASTED]], align 8 +// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i64 [[TMP3]]) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR1]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]] +// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK1: omp.precond.then: +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], 
align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
+// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17:![0-9]+]]
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: store i32 [[TMP17]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: [[TMP18:%.*]] = load i64, ptr [[N_CASTED]], align 8, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i64 [[TMP14]], i64 [[TMP16]], i64 [[TMP18]]), !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP20]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]]
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP22]])
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+// CHECK1-NEXT: br i1 [[TMP24]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK1: .omp.final.then:
+// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP25]], 0
+// CHECK1-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1
+// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD8]], ptr [[I3]], align 4
+// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK1: .omp.final.done:
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I4:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK1: omp.precond.then:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV:%.*]] = trunc i64 [[TMP4]] to i32
+// CHECK1-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8
+// CHECK1-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32
+// CHECK1-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK1: omp.inner.for.cond:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21:![0-9]+]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK1-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK1: omp.inner.for.body:
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK1: omp.body.continue:
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK1: omp.inner.for.inc:
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK1-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]]
+// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
+// CHECK1: omp.inner.for.end:
+// CHECK1-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK1: omp.loop.exit:
+// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4
+// CHECK1-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP18]])
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0
+// CHECK1-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK1: .omp.final.then:
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0
+// CHECK1-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1
+// CHECK1-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]]
+// CHECK1-NEXT: store i32 [[ADD11]], ptr [[I4]], align 4
+// CHECK1-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK1: .omp.final.done:
+// CHECK1-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK1: omp.precond.end:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75
+// CHECK1-SAME: (i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@.omp_task_privates_map.
+// CHECK1-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]], ptr noalias noundef [[TMP2:%.*]], ptr noalias noundef [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 8
+// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTADDR3]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP4]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: store ptr [[TMP5]], ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 1
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
+// CHECK1-NEXT: store ptr [[TMP7]], ptr [[TMP8]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 2
+// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTADDR3]], align 8
+// CHECK1-NEXT: store ptr [[TMP9]], ptr [[TMP10]], align 8
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@.omp_task_entry.
+// CHECK1-SAME: (i32 noundef signext [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTFIRSTPRIV_PTR_ADDR2_I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[KERNEL_ARGS_I:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__CASTED_I:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4
+// CHECK1-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2
+// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP3]], i32 0, i32 1
+// CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META24:![0-9]+]])
+// CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META27:![0-9]+]])
+// CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META29:![0-9]+]])
+// CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META31:![0-9]+]])
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META33:![0-9]+]]
+// CHECK1-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]]) #[[ATTR2]]
+// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK1-NEXT: store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1
+// CHECK1-NEXT: store i32 1, ptr [[TMP16]], align 4, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2
+// CHECK1-NEXT: store ptr [[TMP12]], ptr [[TMP17]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3
+// CHECK1-NEXT: store ptr [[TMP13]], ptr [[TMP18]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4
+// CHECK1-NEXT: store ptr [[TMP14]], ptr [[TMP19]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5
+// CHECK1-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP20]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6
+// CHECK1-NEXT: store ptr null, ptr [[TMP21]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7
+// CHECK1-NEXT: store ptr null, ptr [[TMP22]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8
+// CHECK1-NEXT: store i64 0, ptr [[TMP23]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9
+// CHECK1-NEXT: store i64 9, ptr [[TMP24]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP25]], align 4, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11
+// CHECK1-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12
+// CHECK1-NEXT: store i32 [[TMP15]], ptr [[TMP27]], align 4, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.region_id, ptr [[KERNEL_ARGS_I]])
+// CHECK1-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0
+// CHECK1-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]]
+// CHECK1: omp_offload.failed.i:
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK1-NEXT: store i32 [[TMP30]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META33]]
+// CHECK1-NEXT: [[TMP31:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 8, !noalias [[META33]]
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75(i64 [[TMP31]]) #[[ATTR2]]
+// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__EXIT]]
+// CHECK1: .omp_outlined..exit:
+// CHECK1-NEXT: ret i32 0
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55
+// CHECK1-SAME: () #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined)
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60
+// CHECK1-SAME: (i64 noundef [[A:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2
+// CHECK1-NEXT: [[TMP2:%.*]] = sext i16 [[TMP1]] to i32
+// CHECK1-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP2]], i32 0)
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK1-NEXT: store i16 [[TMP5]], ptr [[B_CASTED]], align 2
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[B_CASTED]], align 8
+// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i64 [[TMP4]], i64 [[TMP6]])
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8
+// CHECK1-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[A_ADDR]], align 4
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_Z3bari
+// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[S:%.*]] = alloca [[STRUCT_S1:%.*]], align 4
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[A]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN2S12r1Ei(ptr noundef nonnull align 4 dereferenceable(8) [[S]], i32 noundef [[TMP0]])
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CALL]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[A]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[CALL1:%.*]] = call noundef i32 @_ZL7fstatici(i32 noundef [[TMP2]])
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP3]], [[CALL1]]
+// CHECK3-NEXT: store i32 [[ADD2]], ptr [[A]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[CALL3:%.*]] = call noundef i32 @_Z9ftemplateIiET_i(i32 noundef [[TMP4]])
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP5]], [[CALL3]]
+// CHECK3-NEXT: store i32 [[ADD4]], ptr [[A]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT: ret i32 [[TMP6]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZN2S12r1Ei
+// CHECK3-SAME: (ptr noundef nonnull align 4 dereferenceable(8) [[THIS:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[B:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS3:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_PTRS4:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS5:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[KERNEL_ARGS6:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[B]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]]
+// CHECK3-NEXT: store i32 [[SUB]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[B]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[B_CASTED]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_CASTED]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[THIS1]], ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[A]], ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr null, ptr [[TMP8]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 [[TMP3]], ptr [[TMP9]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 [[TMP3]], ptr [[TMP10]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
+// CHECK3-NEXT: store ptr null, ptr [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[TMP13]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr null, ptr [[TMP14]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 3, ptr [[TMP18]], align 4
+// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 3, ptr [[TMP19]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr [[TMP15]], ptr [[TMP20]], align 4
+// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP16]], ptr [[TMP21]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT: store ptr @.offload_sizes, ptr [[TMP22]], align 4
+// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT: store ptr @.offload_maptypes, ptr [[TMP23]], align 4
+// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP24]], align 4
+// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP25]], align 4
+// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT: store i64 0, ptr [[TMP26]], align 8
+// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT: store i64 4, ptr [[TMP27]], align 8
+// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP28]], align 4
+// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP29]], align 4
+// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT: store i32 [[TMP17]], ptr [[TMP30]], align 4
+// CHECK3-NEXT: [[TMP31:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
+// CHECK3-NEXT: br i1 [[TMP32]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3: omp_offload.failed:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88(ptr [[THIS1]], i32 [[TMP3]], i32 [[TMP5]]) #[[ATTR2:[0-9]+]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK3: omp_offload.cont:
+// CHECK3-NEXT: [[A2:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[THIS1]], ptr [[TMP33]], align 4
+// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
+// CHECK3-NEXT: store ptr [[A2]], ptr [[TMP34]], align 4
+// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS5]], i32 0, i32 0
+// CHECK3-NEXT: store ptr null, ptr [[TMP35]], align 4
+// CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS3]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS4]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 0
+// CHECK3-NEXT: store i32 3, ptr [[TMP38]], align 4
+// CHECK3-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 1
+// CHECK3-NEXT: store i32 1, ptr [[TMP39]], align 4
+// CHECK3-NEXT: [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 2
+// CHECK3-NEXT: store ptr [[TMP36]], ptr [[TMP40]], align 4
+// CHECK3-NEXT: [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP37]], ptr [[TMP41]], align 4
+// CHECK3-NEXT: [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 4
+// CHECK3-NEXT: store ptr @.offload_sizes.1, ptr [[TMP42]], align 4
+// CHECK3-NEXT: [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 5
+// CHECK3-NEXT: store ptr @.offload_maptypes.2, ptr [[TMP43]], align 4
+// CHECK3-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP44]], align 4
+// CHECK3-NEXT: [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP45]], align 4
+// CHECK3-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 8
+// CHECK3-NEXT: store i64 0, ptr [[TMP46]], align 8
+// CHECK3-NEXT: [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 9
+// CHECK3-NEXT: store i64 0, ptr [[TMP47]], align 8
+// CHECK3-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP48]], align 4
+// CHECK3-NEXT: [[TMP49:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP49]], align 4
+// CHECK3-NEXT: [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS6]], i32 0, i32 12
+// CHECK3-NEXT: store i32 1024, ptr [[TMP50]], align 4
+// CHECK3-NEXT: [[TMP51:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93.region_id, ptr [[KERNEL_ARGS6]])
+// CHECK3-NEXT: [[TMP52:%.*]] = icmp ne i32 [[TMP51]], 0
+// CHECK3-NEXT: br i1 [[TMP52]], label [[OMP_OFFLOAD_FAILED7:%.*]], label [[OMP_OFFLOAD_CONT8:%.*]]
+// CHECK3: omp_offload.failed7:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93(ptr [[THIS1]]) #[[ATTR2]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT8]]
+// CHECK3: omp_offload.cont8:
+// CHECK3-NEXT: [[A9:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[THIS1]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP53:%.*]] = load double, ptr [[A9]], align 4
+// CHECK3-NEXT: [[CONV:%.*]] = fptosi double [[TMP53]] to i32
+// CHECK3-NEXT: ret i32 [[CONV]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_ZL7fstatici
+// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED2:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED8:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS9:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_PTRS10:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS11:%.*]] = alloca [1 x ptr], align 4
+// CHECK3-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], 32
+// CHECK3-NEXT: store i32 [[MUL]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP3]], ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 [[TMP7]], ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED2]], align 4
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP4]], ptr [[TMP9]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP4]], ptr [[TMP10]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr null, ptr [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 [[TMP6]], ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 [[TMP6]], ptr [[TMP13]], align 4
+// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
+// CHECK3-NEXT: store ptr null, ptr [[TMP14]], align 4
+// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK3-NEXT: store i32 [[TMP8]], ptr [[TMP15]], align 4
+// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK3-NEXT: store i32 [[TMP8]], ptr [[TMP16]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr null, ptr [[TMP17]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP21]], ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP22]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB5:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT: store i32 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK3-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_4]], align 4
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], 1
+// CHECK3-NEXT: [[TMP24:%.*]] = zext i32 [[ADD]] to i64
+// CHECK3-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[TMP26:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP20]], 0
+// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 3, ptr [[TMP27]], align 4
+// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 3, ptr [[TMP28]], align 4
+// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr [[TMP18]], ptr [[TMP29]], align 4
+// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP19]], ptr [[TMP30]], align 4
+// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT: store ptr @.offload_sizes.3, ptr [[TMP31]], align 4
+// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT: store ptr @.offload_maptypes.4, ptr [[TMP32]], align 4
+// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP33]], align 4
+// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP34]], align 4
+// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT: store i64 [[TMP24]], ptr [[TMP35]], align 8
+// CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT: store i64 8, ptr [[TMP36]], align 8
+// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] [[TMP26]], ptr [[TMP37]], align 4
+// CHECK3-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP38]], align 4
+// CHECK3-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT: store i32 [[TMP25]], ptr [[TMP39]], align 4
+// CHECK3-NEXT: [[TMP40:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP20]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT: [[TMP41:%.*]] = icmp ne i32 [[TMP40]], 0
+// CHECK3-NEXT: br i1 [[TMP41]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3: omp_offload.failed:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71(i32 [[TMP4]], i32 [[TMP6]], i32 [[TMP8]]) #[[ATTR2]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK3: omp_offload.cont:
+// CHECK3-NEXT: [[TMP42:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[ADD7:%.*]] = add nsw i32 32, [[TMP42]]
+// CHECK3-NEXT: store i32 [[ADD7]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK3-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK3-NEXT: store i32 [[TMP43]], ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4
+// CHECK3-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED8]], align 4
+// CHECK3-NEXT: [[TMP45:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP44]], ptr [[TMP45]], align 4
+// CHECK3-NEXT: [[TMP46:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP44]], ptr [[TMP46]], align 4
+// CHECK3-NEXT: [[TMP47:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS11]], i32 0, i32 0
+// CHECK3-NEXT: store ptr null, ptr [[TMP47]], align 4
+// CHECK3-NEXT: [[TMP48:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS9]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP49:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS10]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP50:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK3-NEXT: store i32 [[TMP51]], ptr [[TMP50]], align 4
+// CHECK3-NEXT: [[TMP52:%.*]] = call ptr @__kmpc_omp_target_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 36, i32 4, ptr @.omp_task_entry., i64 -1)
+// CHECK3-NEXT: [[TMP53:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP52]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP54:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP53]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP55:%.*]] = load ptr, ptr [[TMP54]], align 4
+// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP55]], ptr align 4 [[AGG_CAPTURED]], i32 4, i1 false)
+// CHECK3-NEXT: [[TMP56:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP52]], i32 0, i32 1
+// CHECK3-NEXT: [[TMP57:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP56]], i32 0, i32 0
+// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP57]], ptr align 4 @.offload_sizes.5, i32 8, i1 false)
+// CHECK3-NEXT: [[TMP58:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 1
+// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP58]], ptr align 4 [[TMP48]], i32 4, i1 false)
+// CHECK3-NEXT: [[TMP59:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP56]], i32 0, i32 2
+// CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP59]], ptr align 4 [[TMP49]], i32 4, i1 false)
+// CHECK3-NEXT: [[TMP60:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP52]])
+// CHECK3-NEXT: [[TMP61:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP61]], 1
+// CHECK3-NEXT: ret i32 [[ADD12]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@_Z9ftemplateIiET_i
+// CHECK3-SAME: (i32 noundef [[N:%.*]]) #[[ATTR0]] comdat {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[A:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK3-NEXT: [[B:%.*]] = alloca i16, align 2
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i16, align 2
+// CHECK3-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [3 x ptr], align 4
+// CHECK3-NEXT: [[KERNEL_ARGS1:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS]], align 8
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[A]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 3, ptr [[TMP0]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr null, ptr [[TMP2]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK3-NEXT: store ptr null, ptr [[TMP3]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK3-NEXT: store ptr null, ptr [[TMP4]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK3-NEXT: store ptr null, ptr [[TMP5]], align 4
+// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP6]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP7]], align 4
+// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK3-NEXT: store i64 0, ptr [[TMP8]], align 8
+// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK3-NEXT: store i64 8, ptr [[TMP9]], align 8
+// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP10]], align 4
+// CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP11]], align 4
+// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK3-NEXT: store i32 20, ptr [[TMP12]], align 4
+// CHECK3-NEXT: [[TMP13:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.region_id, ptr [[KERNEL_ARGS]])
+// CHECK3-NEXT: [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
+// CHECK3-NEXT: br i1 [[TMP14]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK3: omp_offload.failed:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55() #[[ATTR2]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT]]
+// CHECK3: omp_offload.cont:
+// CHECK3-NEXT: store i16 1, ptr [[B]], align 2
+// CHECK3-NEXT: [[TMP15:%.*]] = load i16, ptr [[B]], align 2
+// CHECK3-NEXT: store i16 [[TMP15]], ptr [[DOTCAPTURE_EXPR_]], align 2
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT: store i32 [[TMP16]], ptr [[A_CASTED]], align 4
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[A_CASTED]], align 4
+// CHECK3-NEXT: [[TMP18:%.*]] = load i16, ptr [[B]], align 2
+// CHECK3-NEXT: store i16 [[TMP18]], ptr [[B_CASTED]], align 2
+// CHECK3-NEXT: [[TMP19:%.*]] = load i32, ptr [[B_CASTED]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2
+// CHECK3-NEXT: store i16 [[TMP20]], ptr [[DOTCAPTURE_EXPR__CASTED]], align 2
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP17]], ptr [[TMP22]], align 4
+// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: store i32 [[TMP17]], ptr [[TMP23]], align 4
+// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 0
+// CHECK3-NEXT: store ptr null, ptr [[TMP24]], align 4
+// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 [[TMP19]], ptr [[TMP25]], align 4
+// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 1
+// CHECK3-NEXT: store i32 [[TMP19]], ptr [[TMP26]], align 4
+// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 1
+// CHECK3-NEXT: store ptr null, ptr [[TMP27]], align 4
+// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 2
+// CHECK3-NEXT: store i32 [[TMP21]], ptr [[TMP28]], align 4
+// CHECK3-NEXT: [[TMP29:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 2
+// CHECK3-NEXT: store i32 [[TMP21]], ptr [[TMP29]], align 4
+// CHECK3-NEXT: [[TMP30:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i32 0, i32 2
+// CHECK3-NEXT: store ptr null, ptr [[TMP30]], align 4
+// CHECK3-NEXT: [[TMP31:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP32:%.*]] = getelementptr inbounds [3 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK3-NEXT: [[TMP33:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR_]], align 2
+// CHECK3-NEXT: [[TMP34:%.*]] = sext i16 [[TMP33]] to i32
+// CHECK3-NEXT: [[TMP35:%.*]] = insertvalue [3 x i32] zeroinitializer, i32 [[TMP34]], 0
+// CHECK3-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 0
+// CHECK3-NEXT: store i32 3, ptr [[TMP36]], align 4
+// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 1
+// CHECK3-NEXT: store i32 3, ptr [[TMP37]], align 4
+// CHECK3-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 2
+// CHECK3-NEXT: store ptr [[TMP31]], ptr [[TMP38]], align 4
+// CHECK3-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 3
+// CHECK3-NEXT: store ptr [[TMP32]], ptr [[TMP39]], align 4
+// CHECK3-NEXT: [[TMP40:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 4
+// CHECK3-NEXT: store ptr @.offload_sizes.7, ptr [[TMP40]], align 4
+// CHECK3-NEXT: [[TMP41:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 5
+// CHECK3-NEXT: store ptr @.offload_maptypes.8, ptr [[TMP41]], align 4
+// CHECK3-NEXT: [[TMP42:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 6
+// CHECK3-NEXT: store ptr null, ptr [[TMP42]], align 4
+// CHECK3-NEXT: [[TMP43:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 7
+// CHECK3-NEXT: store ptr null, ptr [[TMP43]], align 4
+// CHECK3-NEXT: [[TMP44:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 8
+// CHECK3-NEXT: store i64 0, ptr [[TMP44]], align 8
+// CHECK3-NEXT: [[TMP45:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 9
+// CHECK3-NEXT: store i64 8, ptr [[TMP45]], align 8
+// CHECK3-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 10
+// CHECK3-NEXT: store [3 x i32] [[TMP35]], ptr [[TMP46]], align 4
+// CHECK3-NEXT: [[TMP47:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 11
+// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP47]], align 4
+// CHECK3-NEXT: [[TMP48:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS1]], i32 0, i32 12
+// CHECK3-NEXT: store i32 1024, ptr [[TMP48]], align 4
+// CHECK3-NEXT: [[TMP49:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 [[TMP34]], i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.region_id, ptr [[KERNEL_ARGS1]])
+// CHECK3-NEXT: [[TMP50:%.*]] = icmp ne i32 [[TMP49]], 0
+// CHECK3-NEXT: br i1 [[TMP50]], label [[OMP_OFFLOAD_FAILED2:%.*]], label [[OMP_OFFLOAD_CONT3:%.*]]
+// CHECK3: omp_offload.failed2:
+// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60(i32 [[TMP17]], i32 [[TMP19]], i32 [[TMP21]]) #[[ATTR2]]
+// CHECK3-NEXT: br label [[OMP_OFFLOAD_CONT3]]
+// CHECK3: omp_offload.cont3:
+// CHECK3-NEXT: [[TMP51:%.*]] = load i32, ptr [[A]], align 4
+// CHECK3-NEXT: ret i32 [[TMP51]]
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88
+// CHECK3-SAME: (ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP1]], ptr [[B_CASTED]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[B_CASTED]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i32 [[TMP2]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK3-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double
+// CHECK3-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK3-NEXT: store double [[ADD]], ptr [[A]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93
+// CHECK3-SAME: (ptr noundef [[THIS:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4
+// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK3-NEXT: store double 2.500000e+00, ptr [[A]], align 4
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71
+// CHECK3-SAME: (i32 noundef [[N:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4
+// CHECK3-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP1]], i32 0)
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP2]], ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_CASTED]], align 4
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i32 [[TMP3]])
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK3-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0
+// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK3-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]]
+// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK3: omp.precond.then:
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
+// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK3: cond.true:
+// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK3-NEXT: br label [[COND_END:%.*]]
+// CHECK3: cond.false:
+// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: br label [[COND_END]]
+// CHECK3: cond.end:
+// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ]
+// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4
+// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4
+// CHECK3-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK3: omp.inner.for.cond:
+// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP18:![0-9]+]]
+// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]]
+// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK3: omp.inner.for.body:
+// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: store i32 [[TMP15]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i32 [[TMP13]], i32 [[TMP14]], i32 [[TMP16]]), !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK3: omp.inner.for.inc:
+// CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
+// CHECK3-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP18]]
+// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]]
+// CHECK3: omp.inner.for.end:
+// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK3: omp.loop.exit:
+// CHECK3-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP20]])
+// CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK3-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+// CHECK3-NEXT: br i1 [[TMP22]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]]
+// CHECK3: .omp.final.then:
+// CHECK3-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK3-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP23]], 0
+// CHECK3-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1
+// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1
+// CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK3-NEXT: store i32 [[ADD8]], ptr [[I3]], align 4
+// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]]
+// CHECK3: .omp.final.done:
+// CHECK3-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK3: omp.precond.end:
+// CHECK3-NEXT: ret void
+//
+//
+// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined
+// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR1]] {
+// CHECK3-NEXT: entry:
+// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: [[I3:%.*]] = alloca i32, align 4
+// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4
+// CHECK3-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK3-NEXT:
[[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0 +// CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK3-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK3-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]] +// CHECK3-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK3: omp.precond.then: +// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK3-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB3:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] +// CHECK3-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK3: cond.true: +// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK3-NEXT: br label [[COND_END:%.*]] +// CHECK3: cond.false: +// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: br label [[COND_END]] +// CHECK3: cond.end: +// CHECK3-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ] +// CHECK3-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK3-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4 +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK3: omp.inner.for.cond: +// CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22:![0-9]+]] +// CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP22]] +// CHECK3-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] +// CHECK3-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK3: omp.inner.for.body: +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]] +// CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK3-NEXT: store i32 [[ADD]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP22]] +// CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK3: omp.body.continue: +// CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK3: omp.inner.for.inc: +// 
CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]] +// CHECK3-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK3-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP22]] +// CHECK3-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] +// CHECK3: omp.inner.for.end: +// CHECK3-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK3: omp.loop.exit: +// CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +// CHECK3-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP18]]) +// CHECK3-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK3-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK3: .omp.final.then: +// CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK3-NEXT: [[SUB7:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK3-NEXT: [[DIV8:%.*]] = sdiv i32 [[SUB7]], 1 +// CHECK3-NEXT: [[MUL9:%.*]] = mul nsw i32 [[DIV8]], 1 +// CHECK3-NEXT: [[ADD10:%.*]] = add nsw i32 0, [[MUL9]] +// CHECK3-NEXT: store i32 [[ADD10]], ptr [[I3]], align 4 +// CHECK3-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK3: .omp.final.done: +// CHECK3-NEXT: br label [[OMP_PRECOND_END]] +// CHECK3: omp.precond.end: +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75 +// CHECK3-SAME: (i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@.omp_task_privates_map. 
+// CHECK3-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]], ptr noalias noundef [[TMP2:%.*]], ptr noalias noundef [[TMP3:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTADDR2:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTADDR3:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 4 +// CHECK3-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 4 +// CHECK3-NEXT: store ptr [[TMP2]], ptr [[DOTADDR2]], align 4 +// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTADDR3]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTADDR3]], align 4 +// CHECK3-NEXT: store ptr [[TMP5]], ptr [[TMP6]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[DOTADDR1]], align 4 +// CHECK3-NEXT: store ptr [[TMP7]], ptr [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T]], ptr [[TMP4]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTADDR2]], align 4 +// CHECK3-NEXT: store ptr [[TMP9]], ptr [[TMP10]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@.omp_task_entry. +// CHECK3-SAME: (i32 noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR_I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTPART_ID__ADDR_I:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTPRIVATES__ADDR_I:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTCOPY_FN__ADDR_I:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTTASK_T__ADDR_I:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[__CONTEXT_ADDR_I:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTFIRSTPRIV_PTR_ADDR_I:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTFIRSTPRIV_PTR_ADDR1_I:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTFIRSTPRIV_PTR_ADDR2_I:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[KERNEL_ARGS_I:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR__CASTED_I:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: store i32 [[TMP0]], ptr [[DOTADDR]], align 4 +// CHECK3-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTADDR]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP3]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2 +// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP3]], i32 0, i32 1 +// CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META25:![0-9]+]]) +// CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META28:![0-9]+]]) +// 
CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META30:![0-9]+]]) +// CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META32:![0-9]+]]) +// CHECK3-NEXT: store i32 [[TMP2]], ptr [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias [[META34:![0-9]+]] +// CHECK3-NEXT: store ptr [[TMP5]], ptr [[DOTPART_ID__ADDR_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: store ptr [[TMP8]], ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: store ptr @.omp_task_privates_map., ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: store ptr [[TMP3]], ptr [[DOTTASK_T__ADDR_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: store ptr [[TMP7]], ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__CONTEXT_ADDR_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[DOTCOPY_FN__ADDR_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[DOTPRIVATES__ADDR_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: call void [[TMP10]](ptr [[TMP11]], ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]]) #[[ATTR2]] +// CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR1_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTFIRSTPRIV_PTR_ADDR2_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4 +// CHECK3-NEXT: store i32 3, ptr [[KERNEL_ARGS_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 1 +// CHECK3-NEXT: store i32 1, ptr [[TMP16]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 2 +// CHECK3-NEXT: store ptr [[TMP12]], ptr [[TMP17]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 3 +// CHECK3-NEXT: store ptr [[TMP13]], ptr [[TMP18]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 4 +// CHECK3-NEXT: store ptr [[TMP14]], ptr [[TMP19]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 5 +// CHECK3-NEXT: store ptr @.offload_maptypes.6, ptr [[TMP20]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 6 +// CHECK3-NEXT: store ptr null, ptr [[TMP21]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 7 +// CHECK3-NEXT: store ptr null, ptr [[TMP22]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 8 +// CHECK3-NEXT: store i64 0, ptr [[TMP23]], align 8, !noalias [[META34]] +// CHECK3-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 9 +// CHECK3-NEXT: store i64 9, ptr 
[[TMP24]], align 8, !noalias [[META34]] +// CHECK3-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 10 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP25]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 11 +// CHECK3-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP26]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS_I]], i32 0, i32 12 +// CHECK3-NEXT: store i32 [[TMP15]], ptr [[TMP27]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP28:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1]], i64 -1, i32 0, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.region_id, ptr [[KERNEL_ARGS_I]]) +// CHECK3-NEXT: [[TMP29:%.*]] = icmp ne i32 [[TMP28]], 0 +// CHECK3-NEXT: br i1 [[TMP29]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__EXIT:%.*]] +// CHECK3: omp_offload.failed.i: +// CHECK3-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP9]], align 4 +// CHECK3-NEXT: store i32 [[TMP30]], ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__CASTED_I]], align 4, !noalias [[META34]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75(i32 [[TMP31]]) #[[ATTR2]] +// CHECK3-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] +// CHECK3: .omp_outlined..exit: +// CHECK3-NEXT: ret i32 0 +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55 +// CHECK3-SAME: () #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB1]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60 +// CHECK3-SAME: (i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) +// CHECK3-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2 +// CHECK3-NEXT: [[TMP2:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK3-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB1]], i32 [[TMP0]], i32 [[TMP2]], i32 0) +// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2 +// CHECK3-NEXT: store i16 [[TMP5]], ptr [[B_CASTED]], align 2 +// CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[B_CASTED]], align 4 +// CHECK3-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB1]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i32 [[TMP4]], i32 [[TMP6]]) +// CHECK3-NEXT: ret void +// +// +// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined +// CHECK3-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR1]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK3-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK3-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CHECK3-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2 +// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 +// CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]] +// CHECK3-NEXT: store i32 [[ADD]], ptr [[A_ADDR]], align 4 +// CHECK3-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71 +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[N:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]], i64 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]]) +// CHECK9-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 8 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK9-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP1]], i32 0) +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP2]], ptr [[N_CASTED]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i64, ptr [[N_CASTED]], align 8 +// CHECK9-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i64 [[TMP3]]) +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[N_CASTED:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0 +// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK9-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK9: omp.precond.then: +// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK9-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] +// CHECK9-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK9: cond.true: +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: br label [[COND_END:%.*]] +// CHECK9: cond.false: +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END]] +// CHECK9: cond.end: +// CHECK9-NEXT: [[COND:%.*]] = 
phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ] +// CHECK9-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK9-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16:![0-9]+]] +// CHECK9-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]] +// CHECK9-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: [[TMP14:%.*]] = zext i32 [[TMP13]] to i64 +// CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64 +// CHECK9-NEXT: [[TMP17:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: store i32 [[TMP17]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: [[TMP18:%.*]] = load i64, ptr [[N_CASTED]], align 8, !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i64 [[TMP14]], i64 [[TMP16]], i64 [[TMP18]]), !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP19]], [[TMP20]] +// CHECK9-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP16]] +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK9: omp.loop.exit: +// CHECK9-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP22]]) +// CHECK9-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0 +// CHECK9-NEXT: br i1 [[TMP24]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK9: .omp.final.then: +// CHECK9-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP25]], 0 +// CHECK9-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1 +// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1 +// CHECK9-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL]] +// CHECK9-NEXT: store i32 [[ADD8]], ptr [[I3]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK9: .omp.final.done: +// CHECK9-NEXT: br label [[OMP_PRECOND_END]] +// CHECK9: omp.precond.end: +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef 
[[DOTBOUND_TID_:%.*]], i64 noundef [[DOTPREVIOUS_LB_:%.*]], i64 noundef [[DOTPREVIOUS_UB_:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: [[I4:%.*]] = alloca i32, align 4 +// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[N]], ptr [[N_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0 +// CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK9-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK9-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]] +// CHECK9-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK9: omp.precond.then: +// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8 +// CHECK9-NEXT: [[CONV:%.*]] = trunc i64 [[TMP4]] to i32 +// CHECK9-NEXT: [[TMP5:%.*]] = load i64, ptr [[DOTPREVIOUS_UB__ADDR]], align 8 +// CHECK9-NEXT: [[CONV3:%.*]] = trunc i64 [[TMP5]] to i32 +// CHECK9-NEXT: store i32 [[CONV]], ptr [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[CONV3]], ptr [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK9-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] +// CHECK9-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label 
[[COND_FALSE:%.*]] +// CHECK9: cond.true: +// CHECK9-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK9-NEXT: br label [[COND_END:%.*]] +// CHECK9: cond.false: +// CHECK9-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: br label [[COND_END]] +// CHECK9: cond.end: +// CHECK9-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ] +// CHECK9-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK9-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK9-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4 +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK9: omp.inner.for.cond: +// CHECK9-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20:![0-9]+]] +// CHECK9-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP20]] +// CHECK9-NEXT: [[CMP6:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] +// CHECK9-NEXT: br i1 [[CMP6]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK9: omp.inner.for.body: +// CHECK9-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20]] +// CHECK9-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK9-NEXT: store i32 [[ADD]], ptr [[I4]], align 4, !llvm.access.group [[ACC_GRP20]] +// CHECK9-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK9: omp.body.continue: +// CHECK9-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK9: omp.inner.for.inc: +// CHECK9-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20]] +// CHECK9-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK9-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP20]] +// CHECK9-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP21:![0-9]+]] +// CHECK9: omp.inner.for.end: +// CHECK9-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK9: omp.loop.exit: +// CHECK9-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +// CHECK9-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP18]]) +// CHECK9-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK9-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK9-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK9: .omp.final.then: +// CHECK9-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK9-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK9-NEXT: [[DIV9:%.*]] = sdiv i32 [[SUB8]], 1 +// CHECK9-NEXT: [[MUL10:%.*]] = mul nsw i32 [[DIV9]], 1 +// CHECK9-NEXT: [[ADD11:%.*]] = add nsw i32 0, [[MUL10]] +// CHECK9-NEXT: store i32 [[ADD11]], ptr [[I4]], align 4 +// CHECK9-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK9: .omp.final.done: +// CHECK9-NEXT: br label [[OMP_PRECOND_END]] +// CHECK9: omp.precond.end: +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75 +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr 
[[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined) +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88 +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP1]], ptr [[B_CASTED]], align 4 +// CHECK9-NEXT: [[TMP2:%.*]] = load i64, ptr [[B_CASTED]], align 8 +// CHECK9-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i64 [[TMP2]]) +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK9-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double +// CHECK9-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 +// CHECK9-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK9-NEXT: store double [[ADD]], ptr [[A]], align 8 +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93 +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK9-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK9-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK9-NEXT: store double 2.500000e+00, ptr [[A]], align 8 +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55 +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK9-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined) +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60 +// CHECK9-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i64 noundef [[A:%.*]], i64 noundef [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[A_CASTED:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[B_CASTED:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) +// CHECK9-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 8 +// CHECK9-NEXT: [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2 +// CHECK9-NEXT: [[TMP2:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK9-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 0) +// CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK9-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4 +// CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8 +// CHECK9-NEXT: [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2 +// CHECK9-NEXT: store i16 [[TMP5]], ptr [[B_CASTED]], align 2 +// CHECK9-NEXT: [[TMP6:%.*]] = load i64, ptr [[B_CASTED]], align 8 +// CHECK9-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i64 [[TMP4]], i64 [[TMP6]]) +// CHECK9-NEXT: ret void +// +// +// CHECK9-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined +// CHECK9-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK9-NEXT: entry: +// CHECK9-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK9-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: [[B_ADDR:%.*]] = alloca i64, align 8 +// CHECK9-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK9-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK9-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 +// CHECK9-NEXT: store i64 [[B]], ptr [[B_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2 +// CHECK9-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 +// CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]] +// CHECK9-NEXT: store i32 [[ADD]], ptr [[A_ADDR]], align 4 +// CHECK9-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71 +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[N:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]], i32 noundef [[DOTCAPTURE_EXPR_1:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR__ADDR2:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]]) +// CHECK11-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTCAPTURE_EXPR_1]], ptr [[DOTCAPTURE_EXPR__ADDR2]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK11-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP1]], i32 0) +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP2]], ptr [[N_CASTED]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_CASTED]], align 4 +// CHECK11-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined, i32 [[TMP3]]) +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_COMB_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP5]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_COMB_LB]], ptr [[DOTOMP_COMB_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]] +// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// 
CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP8]], [[COND_TRUE]] ], [ [[TMP9]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB]], align 4 +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP10]], ptr [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17:![0-9]+]] +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP11]], [[TMP12]] +// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_COMB_LB]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_COMB_UB]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, ptr [[N_ADDR]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: store i32 [[TMP15]], ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, ptr [[N_CASTED]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined, i32 [[TMP13]], i32 [[TMP14]], i32 [[TMP16]]), !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP18]] +// CHECK11-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP17]] +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP20]]) +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0 +// CHECK11-NEXT: br i1 [[TMP22]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB6:%.*]] = sub nsw i32 [[TMP23]], 0 +// CHECK11-NEXT: [[DIV7:%.*]] = sdiv i32 [[SUB6]], 1 +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[DIV7]], 1 +// CHECK11-NEXT: [[ADD8:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD8]], ptr [[I3]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l71.omp_outlined.omp_outlined +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 
noundef [[DOTPREVIOUS_LB_:%.*]], i32 noundef [[DOTPREVIOUS_UB_:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_LB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTPREVIOUS_UB__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[I3:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTPREVIOUS_UB_]], ptr [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP1]], 0 +// CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK11-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK11-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP2]] +// CHECK11-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK11: omp.precond.then: +// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: store i32 [[TMP3]], ptr [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTPREVIOUS_UB__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP4]], ptr [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK11-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP6:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP7]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] +// CHECK11-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK11: cond.true: +// CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr 
[[DOTCAPTURE_EXPR_1]], align 4 +// CHECK11-NEXT: br label [[COND_END:%.*]] +// CHECK11: cond.false: +// CHECK11-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: br label [[COND_END]] +// CHECK11: cond.end: +// CHECK11-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ] +// CHECK11-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK11-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK11-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4 +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK11: omp.inner.for.cond: +// CHECK11-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21:![0-9]+]] +// CHECK11-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP21]] +// CHECK11-NEXT: [[CMP5:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] +// CHECK11-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK11: omp.inner.for.body: +// CHECK11-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]] +// CHECK11-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK11-NEXT: store i32 [[ADD]], ptr [[I3]], align 4, !llvm.access.group [[ACC_GRP21]] +// CHECK11-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK11: omp.body.continue: +// CHECK11-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK11: omp.inner.for.inc: +// CHECK11-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]] +// CHECK11-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP16]], 1 +// CHECK11-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP21]] +// CHECK11-NEXT: br label [[OMP_INNER_FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +// CHECK11: omp.inner.for.end: +// CHECK11-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK11: omp.loop.exit: +// CHECK11-NEXT: [[TMP17:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: [[TMP18:%.*]] = load i32, ptr [[TMP17]], align 4 +// CHECK11-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB2]], i32 [[TMP18]]) +// CHECK11-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK11-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP19]], 0 +// CHECK11-NEXT: br i1 [[TMP20]], label [[DOTOMP_FINAL_THEN:%.*]], label [[DOTOMP_FINAL_DONE:%.*]] +// CHECK11: .omp.final.then: +// CHECK11-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK11-NEXT: [[SUB7:%.*]] = sub nsw i32 [[TMP21]], 0 +// CHECK11-NEXT: [[DIV8:%.*]] = sdiv i32 [[SUB7]], 1 +// CHECK11-NEXT: [[MUL9:%.*]] = mul nsw i32 [[DIV8]], 1 +// CHECK11-NEXT: [[ADD10:%.*]] = add nsw i32 0, [[MUL9]] +// CHECK11-NEXT: store i32 [[ADD10]], ptr [[I3]], align 4 +// CHECK11-NEXT: br label [[DOTOMP_FINAL_DONE]] +// CHECK11: .omp.final.done: +// CHECK11-NEXT: br label [[OMP_PRECOND_END]] +// CHECK11: omp.precond.end: +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75 +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// 
CHECK11-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined) +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZL7fstatici_l75.omp_outlined +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88 +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4 +// CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP1]], ptr [[B_CASTED]], align 4 +// CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[B_CASTED]], align 4 +// CHECK11-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined, ptr [[TMP0]], i32 [[TMP2]]) +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l88.omp_outlined +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef [[THIS:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4 +// CHECK11-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to double +// CHECK11-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00 +// CHECK11-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK11-NEXT: store double [[ADD]], ptr [[A]], align 4 +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__ZN2S12r1Ei_l93 +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef [[THIS:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4 +// CHECK11-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 +// CHECK11-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_S1:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK11-NEXT: store double 2.500000e+00, ptr [[A]], align 4 +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55 +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4 +// CHECK11-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 0, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined) +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l55.omp_outlined +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60 +// CHECK11-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[A_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[B_CASTED:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) +// CHECK11-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR]], align 4 +// CHECK11-NEXT: [[TMP1:%.*]] = load i16, ptr [[DOTCAPTURE_EXPR__ADDR]], align 2 +// CHECK11-NEXT: [[TMP2:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK11-NEXT: call void @__kmpc_push_num_teams(ptr @[[GLOB3]], i32 [[TMP0]], i32 [[TMP2]], i32 0) +// CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4 +// CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4 +// CHECK11-NEXT: [[TMP5:%.*]] = load i16, ptr [[B_ADDR]], align 2 +// CHECK11-NEXT: store i16 [[TMP5]], ptr [[B_CASTED]], align 2 +// CHECK11-NEXT: [[TMP6:%.*]] = load i32, ptr [[B_CASTED]], align 4 +// CHECK11-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 2, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined, i32 [[TMP4]], i32 [[TMP6]]) +// CHECK11-NEXT: ret void +// +// +// CHECK11-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l60.omp_outlined +// CHECK11-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK11-NEXT: entry: +// CHECK11-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 4 +// CHECK11-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4 +// CHECK11-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK11-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 4 +// CHECK11-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 +// CHECK11-NEXT: store i32 [[B]], ptr [[B_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load i16, ptr [[B_ADDR]], align 2 +// CHECK11-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 +// CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[A_ADDR]], align 4 +// CHECK11-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[CONV]] +// CHECK11-NEXT: store i32 [[ADD]], ptr [[A_ADDR]], align 4 +// CHECK11-NEXT: ret void +// diff --git a/clang/test/Preprocessor/init-riscv.c b/clang/test/Preprocessor/init-riscv.c new file mode 100644 index 0000000000000..4eeecccff4378 --- /dev/null +++ b/clang/test/Preprocessor/init-riscv.c @@ -0,0 +1,10 @@ +// RUN: %clang_cc1 -E -dM -triple=riscv32 < /dev/null | \ +// RUN: FileCheck -match-full-lines -check-prefixes=RV32 %s +// RUN: %clang_cc1 -E -dM -triple=riscv64 < /dev/null | \ +// RUN: FileCheck -match-full-lines -check-prefixes=RV64 %s + +// RV32: #define __GCC_CONSTRUCTIVE_SIZE 64 +// RV32: #define __GCC_DESTRUCTIVE_SIZE 64 + +// RV64: #define __GCC_CONSTRUCTIVE_SIZE 64 +// RV64: #define __GCC_DESTRUCTIVE_SIZE 64 diff --git a/flang-rt/lib/runtime/environment.cpp b/flang-rt/lib/runtime/environment.cpp index 97ac56236e799..2a2e19f9f17ec 100644 --- a/flang-rt/lib/runtime/environment.cpp +++ b/flang-rt/lib/runtime/environment.cpp @@ -17,6 +17,10 @@ #ifdef _WIN32 extern char **_environ; +#elif defined(__FreeBSD__) +// FreeBSD has environ in crt rather than libc. 
Using "extern char** environ" +// in the code of a shared library makes it fail to link with -Wl,--no-undefined +// See https://reviews.freebsd.org/D30842#840642 #else extern char **environ; #endif @@ -104,6 +108,11 @@ void ExecutionEnvironment::Configure(int ac, const char *av[], #ifdef _WIN32 envp = _environ; +#elif defined(__FreeBSD__) + auto envpp{reinterpret_cast(dlsym(RTLD_DEFAULT, "environ"))}; + if (envpp) { + envp = *envpp; + } #else envp = environ; #endif diff --git a/flang/include/flang/Optimizer/Builder/CUFCommon.h b/flang/include/flang/Optimizer/Builder/CUFCommon.h index 5c56dd6b695f8..6e2442745f9a0 100644 --- a/flang/include/flang/Optimizer/Builder/CUFCommon.h +++ b/flang/include/flang/Optimizer/Builder/CUFCommon.h @@ -18,6 +18,7 @@ static constexpr llvm::StringRef cudaSharedMemSuffix = "__shared_mem"; namespace fir { class FirOpBuilder; +class KindMapping; } // namespace fir namespace cuf { @@ -34,6 +35,10 @@ bool isRegisteredDeviceAttr(std::optional attr); void genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder); +int computeElementByteSize(mlir::Location loc, mlir::Type type, + fir::KindMapping &kindMap, + bool emitErrorOnFailure = true); + } // namespace cuf #endif // FORTRAN_OPTIMIZER_TRANSFORMS_CUFCOMMON_H_ diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index bae52d63fda45..289c79bd9b831 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -80,8 +80,7 @@ def AnyRefOfConstantSizeAggregateType : TypeConstraint< // Memory SSA operations //===----------------------------------------------------------------------===// -def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments, - MemoryEffects<[MemAlloc]>]> { +def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments]> { let summary = "allocate storage for a temporary on the stack given a type"; let description = [{ This primitive operation is used to allocate an object on the stack. 
A @@ -162,7 +161,9 @@ def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments, Variadic<AnyIntegerType>:$shape ); - let results = (outs fir_ReferenceType); + let results = + (outs Res<fir_ReferenceType, "", [MemAlloc]>:$res); let hasCustomAssemblyFormat = 1; let hasVerifier = 1; @@ -212,8 +213,7 @@ def fir_AllocaOp : fir_Op<"alloca", [AttrSizedOperandSegments, }]; } -def fir_AllocMemOp : fir_Op<"allocmem", - [MemoryEffects<[MemAlloc]>, AttrSizedOperandSegments]> { +def fir_AllocMemOp : fir_Op<"allocmem", [AttrSizedOperandSegments]> { let summary = "allocate storage on the heap for an object of a given type"; let description = [{ @@ -235,7 +235,7 @@ def fir_AllocMemOp : fir_Op<"allocmem", Variadic<AnyIntegerType>:$typeparams, Variadic<AnyIntegerType>:$shape ); - let results = (outs fir_HeapType); + let results = (outs Res<fir_HeapType, "", [MemAlloc]>:$res); let hasCustomAssemblyFormat = 1; let hasVerifier = 1; diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index cab8063d0c850..ea1b53fb5185f 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1011,9 +1011,7 @@ getImplicitMapTypeAndKind(fir::FirOpBuilder &firOpBuilder, mlir::omp::VariableCaptureKind::ByRef); break; case DefMap::ImplicitBehavior::Firstprivate: - case DefMap::ImplicitBehavior::None: - TODO(loc, "Firstprivate and None are currently unsupported defaultmap " - "behaviour"); + TODO(loc, "Firstprivate is currently unsupported defaultmap behaviour"); break; case DefMap::ImplicitBehavior::From: return std::make_pair(mapFlag |= mlir::omp::ClauseMapFlags::from, @@ -1035,8 +1033,9 @@ getImplicitMapTypeAndKind(fir::FirOpBuilder &firOpBuilder, mlir::omp::VariableCaptureKind::ByRef); break; case DefMap::ImplicitBehavior::Default: + case DefMap::ImplicitBehavior::None: llvm_unreachable( - "Implicit None Behaviour Should Have Been Handled Earlier"); + "Implicit None and Default behaviour should have been handled earlier"); break; } @@ -1778,14 +1777,18 @@ static void genTaskloopClauses(lower::AbstractConverter &converter, mlir::omp::TaskloopOperands &clauseOps) { ClauseProcessor cp(converter, semaCtx, clauses); + cp.processAllocate(clauseOps); + cp.processFinal(stmtCtx, clauseOps); cp.processGrainsize(stmtCtx, clauseOps); + cp.processIf(llvm::omp::Directive::OMPD_taskloop, clauseOps); + cp.processMergeable(clauseOps); cp.processNumTasks(stmtCtx, clauseOps); + cp.processPriority(stmtCtx, clauseOps); + cp.processUntied(clauseOps); - cp.processTODO(loc, llvm::omp::Directive::OMPD_taskloop); + cp.processTODO<clause::Collapse, clause::InReduction, clause::Lastprivate, + clause::Nogroup, clause::Reduction>( + loc, llvm::omp::Directive::OMPD_taskloop); } static void genTaskwaitClauses(lower::AbstractConverter &converter, diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp index 73ddd1ff80126..ef9894232b409 100644 --- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp +++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp @@ -27,6 +27,26 @@ using namespace mlir; #define DEBUG_TYPE "fir-alias-analysis" +// Inspect for value-scoped Allocate effects and determine whether +// 'candidate' is a new allocation.
Returns SourceKind::Allocate if a +// MemAlloc effect is attached +static fir::AliasAnalysis::SourceKind +classifyAllocateFromEffects(mlir::Operation *op, mlir::Value candidate) { + if (!op) + return fir::AliasAnalysis::SourceKind::Unknown; + auto interface = llvm::dyn_cast<mlir::MemoryEffectOpInterface>(op); + if (!interface) + return fir::AliasAnalysis::SourceKind::Unknown; + llvm::SmallVector<mlir::MemoryEffects::EffectInstance> effects; + interface.getEffects(effects); + for (mlir::MemoryEffects::EffectInstance &e : effects) { + if (mlir::isa<mlir::MemoryEffects::Allocate>(e.getEffect()) && + e.getValue() && e.getValue() == candidate) + return fir::AliasAnalysis::SourceKind::Allocate; + } + return fir::AliasAnalysis::SourceKind::Unknown; +} + //===----------------------------------------------------------------------===// // AliasAnalysis: alias //===----------------------------------------------------------------------===// @@ -535,6 +555,11 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, mlir::Operation *instantiationPoint{nullptr}; while (defOp && !breakFromLoop) { ty = defOp->getResultTypes()[0]; + // Value-scoped allocation detection via effects. + if (classifyAllocateFromEffects(defOp, v) == SourceKind::Allocate) { + type = SourceKind::Allocate; + break; + } llvm::TypeSwitch<Operation *>(defOp) .Case([&](auto op) { v = op.getVar(); @@ -554,11 +579,6 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, defOp = v.getDefiningOp(); } }) - .Case<fir::AllocaOp, fir::AllocMemOp>([&](auto op) { - // Unique memory allocation. - type = SourceKind::Allocate; - breakFromLoop = true; - }) .Case<fir::ConvertOp>([&](auto op) { // Skip ConvertOp's and track further through the operand. v = op->getOperand(0); @@ -628,16 +648,23 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, type = SourceKind::Global; } else { auto def = llvm::cast<mlir::Value>(boxSrc.origin.u); - // TODO: Add support to fir.allocmem - if (auto allocOp = def.template getDefiningOp<fir::AllocaOp>()) { - v = def; - defOp = v.getDefiningOp(); - type = SourceKind::Allocate; - } else if (isDummyArgument(def)) { - defOp = nullptr; - v = def; - } else { - type = SourceKind::Indirect; + bool classified = false; + if (auto defDefOp = def.getDefiningOp()) { + if (classifyAllocateFromEffects(defDefOp, def) == + SourceKind::Allocate) { + v = def; + defOp = defDefOp; + type = SourceKind::Allocate; + classified = true; + } + } + if (!classified) { + if (isDummyArgument(def)) { + defOp = nullptr; + v = def; + } else { + type = SourceKind::Indirect; + } } } breakFromLoop = true; diff --git a/flang/lib/Optimizer/Builder/CUFCommon.cpp b/flang/lib/Optimizer/Builder/CUFCommon.cpp index cf7588f275d22..461deb8e4cb55 100644 --- a/flang/lib/Optimizer/Builder/CUFCommon.cpp +++ b/flang/lib/Optimizer/Builder/CUFCommon.cpp @@ -9,6 +9,7 @@ #include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Dialect/CUF/CUFOps.h" +#include "flang/Optimizer/Dialect/Support/KindMapping.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" @@ -91,3 +92,25 @@ void cuf::genPointerSync(const mlir::Value box, fir::FirOpBuilder &builder) { } } } + +int cuf::computeElementByteSize(mlir::Location loc, mlir::Type type, + fir::KindMapping &kindMap, + bool emitErrorOnFailure) { + auto eleTy = fir::unwrapSequenceType(type); + if (auto t{mlir::dyn_cast<mlir::IntegerType>(eleTy)}) + return t.getWidth() / 8; + if (auto t{mlir::dyn_cast<mlir::FloatType>(eleTy)}) + return t.getWidth() / 8; + if (auto t{mlir::dyn_cast<fir::LogicalType>(eleTy)}) + return kindMap.getLogicalBitsize(t.getFKind()) / 8; + if (auto t{mlir::dyn_cast<mlir::ComplexType>(eleTy)}) { + int
elemSize = + mlir::cast<mlir::FloatType>(t.getElementType()).getWidth() / 8; + return 2 * elemSize; + } + if (auto t{mlir::dyn_cast<fir::CharacterType>(eleTy)}) + return kindMap.getCharacterBitsize(t.getFKind()) / 8; + if (emitErrorOnFailure) + mlir::emitError(loc, "unsupported type"); + return 0; +} diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 8d00272b09f42..5b1b0a2f6feab 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -263,28 +263,6 @@ static bool inDeviceContext(mlir::Operation *op) { return false; } -static int computeWidth(mlir::Location loc, mlir::Type type, - fir::KindMapping &kindMap) { - auto eleTy = fir::unwrapSequenceType(type); - if (auto t{mlir::dyn_cast<mlir::IntegerType>(eleTy)}) - return t.getWidth() / 8; - if (auto t{mlir::dyn_cast<mlir::FloatType>(eleTy)}) - return t.getWidth() / 8; - if (eleTy.isInteger(1)) - return 1; - if (auto t{mlir::dyn_cast<fir::LogicalType>(eleTy)}) - return kindMap.getLogicalBitsize(t.getFKind()) / 8; - if (auto t{mlir::dyn_cast<mlir::ComplexType>(eleTy)}) { - int elemSize = - mlir::cast<mlir::FloatType>(t.getElementType()).getWidth() / 8; - return 2 * elemSize; - } - if (auto t{mlir::dyn_cast_or_null<fir::CharacterType>(eleTy)}) - return kindMap.getCharacterBitsize(t.getFKind()) / 8; - mlir::emitError(loc, "unsupported type"); - return 0; -} - struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> { using OpRewritePattern::OpRewritePattern; @@ -320,7 +298,7 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern<cuf::AllocOp> { mlir::Value bytes; fir::KindMapping kindMap{fir::getKindMapping(mod)}; if (fir::isa_trivial(op.getInType())) { - int width = computeWidth(loc, op.getInType(), kindMap); + int width = cuf::computeElementByteSize(loc, op.getInType(), kindMap); bytes = builder.createIntegerConstant(loc, builder.getIndexType(), width); } else if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>( op.getInType())) { mlir::Type structTy = typeConverter->convertType(seqTy.getEleTy()); size = dl->getTypeSizeInBits(structTy) / 8; } else { - size = computeWidth(loc, seqTy.getEleTy(), kindMap); + size = cuf::computeElementByteSize(loc, seqTy.getEleTy(), kindMap); } mlir::Value width = builder.createIntegerConstant(loc, builder.getIndexType(), size); @@ -704,7 +682,7 @@ struct CUFDataTransferOpConversion typeConverter->convertType(fir::unwrapSequenceType(dstTy)); width = dl->getTypeSizeInBits(structTy) / 8; } else { - width = computeWidth(loc, dstTy, kindMap); + width = cuf::computeElementByteSize(loc, dstTy, kindMap); } mlir::Value widthValue = mlir::arith::ConstantOp::create( rewriter, loc, i64Ty, rewriter.getIntegerAttr(i64Ty, width)); diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index a0779f353d213..3744e43e98a28 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -2966,6 +2966,67 @@ void OmpAttributeVisitor::CreateImplicitSymbols(const Symbol *symbol) { } } +static bool IsOpenMPPointer(const Symbol &symbol) { + if (IsPointer(symbol) || IsBuiltinCPtr(symbol)) + return true; + return false; +} + +static bool IsOpenMPAggregate(const Symbol &symbol) { + if (IsAllocatable(symbol) || IsOpenMPPointer(symbol)) + return false; + + const auto *type{symbol.GetType()}; + // OpenMP categorizes Fortran characters as aggregates.
+ if (type->category() == Fortran::semantics::DeclTypeSpec::Category::Character) + return true; + + if (const auto *det{symbol.GetUltimate() + .detailsIf<Fortran::semantics::ObjectEntityDetails>()}) + if (det->IsArray()) + return true; + + if (type->AsDerived()) + return true; + + if (IsDeferredShape(symbol) || IsAssumedRank(symbol) || + IsAssumedShape(symbol)) + return true; + return false; +} + +static bool IsOpenMPScalar(const Symbol &symbol) { + if (IsOpenMPAggregate(symbol) || IsOpenMPPointer(symbol) || + IsAllocatable(symbol)) + return false; + const auto *type{symbol.GetType()}; + if ((!symbol.GetShape() || symbol.GetShape()->empty()) && + (type->category() == + Fortran::semantics::DeclTypeSpec::Category::Numeric || + type->category() == + Fortran::semantics::DeclTypeSpec::Category::Logical)) + return true; + return false; +} + +static bool DefaultMapCategoryMatchesSymbol( + parser::OmpVariableCategory::Value category, const Symbol &symbol) { + using VarCat = parser::OmpVariableCategory::Value; + switch (category) { + case VarCat::Scalar: + return IsOpenMPScalar(symbol); + case VarCat::Allocatable: + return IsAllocatable(symbol); + case VarCat::Aggregate: + return IsOpenMPAggregate(symbol); + case VarCat::Pointer: + return IsOpenMPPointer(symbol); + case VarCat::All: + return true; + } + return false; +} + // For OpenMP constructs, check all the data-refs within the constructs // and adjust the symbol for each Name if necessary void OmpAttributeVisitor::Post(const parser::Name &name) { @@ -3001,6 +3062,36 @@ void OmpAttributeVisitor::Post(const parser::Name &name) { } } + // TODO: handle the case where default and defaultmap are present on the + // same construct and conflict; defaultmap should supersede default when + // they conflict. + if (!GetContext().defaultMap.empty()) { + // This is checked before implicit data-sharing attributes because this + // rule ignores them and expects explicit predetermined/specified + // attributes to be in place for the categories specified.
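+ // Illustrative example (editorial sketch, not part of the patch): under
+ // DEFAULTMAP(NONE) a referenced variable of a matching category must be
+ // listed explicitly, e.g.
+ //   !$omp target defaultmap(none: scalar) map(tofrom: n)  ! ok: 'n' listed
+ //   !$omp target defaultmap(none: scalar)                 ! error if 'n' is referenced unlisted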
+ if (Symbol * found{currScope().FindSymbol(name.source)}) { + // If the variable has declare target applied to it (enter or link), it + // is exempt from defaultmap(none) restrictions. + if (!symbol->GetUltimate().test(Symbol::Flag::OmpDeclareTarget)) { + auto &dMap = GetContext().defaultMap; + for (auto defaults : dMap) { + if (defaults.second == + parser::OmpDefaultmapClause::ImplicitBehavior::None) { + if (DefaultMapCategoryMatchesSymbol(defaults.first, *found)) { + if (!IsObjectWithDSA(*symbol)) { + context_.Say(name.source, + "The DEFAULTMAP(NONE) clause requires that '%s' must be " + "listed in a " + "data-sharing attribute, data-mapping attribute, or is_device_ptr clause"_err_en_US, + symbol->name()); + } + } + } + } + } + } + } + if (Symbol * found{currScope().FindSymbol(name.source)}) { if (found->GetUltimate().test(semantics::Symbol::Flag::OmpThreadprivate)) return; diff --git a/flang/test/Driver/tco-emit-final-mlir.fir b/flang/test/Driver/tco-emit-final-mlir.fir index 75f8f153127af..7e934c921e773 100644 --- a/flang/test/Driver/tco-emit-final-mlir.fir +++ b/flang/test/Driver/tco-emit-final-mlir.fir @@ -15,5 +15,7 @@ func.func @_QPfoo() { %1 = fir.alloca i32 + %0 = arith.constant 0 : i32 + fir.store %0 to %1 : !fir.ref<i32> return } diff --git a/flang/test/Fir/alloc.fir b/flang/test/Fir/alloc.fir index 8da8b828c18b9..613c8e274baad 100644 --- a/flang/test/Fir/alloc.fir +++ b/flang/test/Fir/alloc.fir @@ -372,8 +372,17 @@ func.func @alloca_unlimited_polymorphic_box() { %0 = fir.alloca !fir.class<none> %1 = fir.alloca !fir.class<!fir.array<?xnone>> %2 = fir.alloca !fir.box<none> %3 = fir.alloca !fir.box<!fir.array<?xnone>> + // Add real uses so allocas are not trivially dead. + fir.call @__use_class_none(%0) : (!fir.ref<!fir.class<none>>) -> () + fir.call @__use_class_array(%1) : (!fir.ref<!fir.class<!fir.array<?xnone>>>) -> () + fir.call @__use_box_none(%2) : (!fir.ref<!fir.box<none>>) -> () + fir.call @__use_box_array(%3) : (!fir.ref<!fir.box<!fir.array<?xnone>>>) -> () return } +func.func private @__use_class_none(!fir.ref<!fir.class<none>>) -> () +func.func private @__use_class_array(!fir.ref<!fir.class<!fir.array<?xnone>>>) -> () +func.func private @__use_box_none(!fir.ref<!fir.box<none>>) -> () +func.func private @__use_box_array(!fir.ref<!fir.box<!fir.array<?xnone>>>) -> () // Note: allocmem of fir.box are not possible (fir::HeapType::verify does not // accept box types), so there is no equivalent of // alloca_unlimited_polymorphic_box for allocmem.
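// Illustrative sketch (editorial, not part of the patch; @use is a hypothetical
// callee): with MemAlloc scoped to the alloca's result rather than to the
// operation, a result with no users carries no observable effect, so DCE may
// erase the alloca before the lowering under test runs:
//
//   func.func @sketch() {
//     %dead = fir.alloca i32   // removable: result is unused
//     return
//   }
//
//   func.func private @use(!fir.ref<i32>) -> ()
//   func.func @kept() {
//     %live = fir.alloca i32
//     fir.call @use(%live) : (!fir.ref<i32>) -> ()   // any real use keeps it live
//     return
//   }
//
// Hence the explicit calls and stores added to the tests in this patch.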
diff --git a/flang/test/Fir/omp-reduction-embox-codegen.fir b/flang/test/Fir/omp-reduction-embox-codegen.fir index 1645e1a407ad4..47fffb35a7d92 100644 --- a/flang/test/Fir/omp-reduction-embox-codegen.fir +++ b/flang/test/Fir/omp-reduction-embox-codegen.fir @@ -28,9 +28,11 @@ func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { omp.parallel reduction(byref @test_reduction %4 -> %arg0 : !fir.ref<!fir.box<i32>>) { omp.terminator } + func.call @__use_box_i32(%4) : (!fir.ref<!fir.box<i32>>) -> () return } +func.func private @__use_box_i32(!fir.ref<!fir.box<i32>>) -> () // basically we are testing that there isn't a crash // CHECK-LABEL: define void @_QQmain // CHECK-NEXT: alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8 diff --git a/flang/test/Fir/pdt.fir b/flang/test/Fir/pdt.fir index a200cd7e7cc03..04f48e745d033 100644 --- a/flang/test/Fir/pdt.fir +++ b/flang/test/Fir/pdt.fir @@ -95,14 +95,14 @@ func.func @_QTt1P.f2.offset(%0 : i32, %1 : i32) -> i32 { // end program p func.func private @bar(!fir.ref<!fir.char<1,?>>) +func.func private @__use_t1(!fir.ref<!fir.type<_QTt1(p1:i32,p2:i32){f1:!fir.char<1,?>,f2:!fir.char<1,?>}>>) -> () // CHECK-LABEL: define void @_QPfoo(i32 %0, i32 %1) func.func @_QPfoo(%arg0 : i32, %arg1 : i32) { // CHECK: %[[size:.*]] = call i64 @_QTt1P.mem.size(i32 %0, i32 %1) // CHECK: %[[alloc:.*]] = alloca i8, i64 %[[size]] %0 = fir.alloca !fir.type<_QTt1(p1:i32,p2:i32){f1:!fir.char<1,?>,f2:!fir.char<1,?>}>(%arg0, %arg1 : i32, i32) - //%2 = fir.coordinate_of %0, f2 : (!fir.ref>) -> !fir.ref> - %2 = fir.zero_bits !fir.ref<!fir.char<1,?>> - fir.call @bar(%2) : (!fir.ref<!fir.char<1,?>>) -> () + // Keep the alloca live without creating an unsupported coordinate_of on a dynamic-sized field. + func.call @__use_t1(%0) : (!fir.ref<!fir.type<_QTt1(p1:i32,p2:i32){f1:!fir.char<1,?>,f2:!fir.char<1,?>}>>) -> () return } diff --git a/flang/test/HLFIR/inline-hlfir-copy-in.fir b/flang/test/HLFIR/inline-hlfir-copy-in.fir index f3c4b38962a0c..f1da1da9f9a5c 100644 --- a/flang/test/HLFIR/inline-hlfir-copy-in.fir +++ b/flang/test/HLFIR/inline-hlfir-copy-in.fir @@ -75,7 +75,7 @@ func.func private @_test_inline_copy_in(%arg0: !fir.box<!fir.array<?xi32>> { // CHECK: %[[VAL_22:.*]] = fir.box_addr %[[VAL_21:.*]]#0 : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>> // CHECK: %[[VAL_23:.*]]:3 = hlfir.associate %[[VAL_5:.*]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1) // CHECK: fir.call @_QFPsb(%[[VAL_22:.*]], %[[VAL_23:.*]]#0) fastmath<contract> : (!fir.ref<!fir.array<?xi32>>, !fir.ref<i32>) -> () -// CHECK: hlfir.copy_out %16, %15#1 : (!fir.ref<!fir.box<!fir.array<?xi32>>>, i1) -> () +// CHECK: hlfir.copy_out %{{.*}}, %[[VAL_21:.*]]#1 : (!fir.ref<!fir.box<!fir.array<?xi32>>>, i1) -> () // CHECK: hlfir.end_associate %[[VAL_23:.*]]#1, %[[VAL_23:.*]]#2 : !fir.ref<i32>, i1 // CHECK: return // CHECK: } diff --git a/flang/test/Lower/Intrinsics/c_f_pointer.f90 b/flang/test/Lower/Intrinsics/c_f_pointer.f90 index c1f1d7972d4b1..f54fda42cf51b 100644 --- a/flang/test/Lower/Intrinsics/c_f_pointer.f90 +++ b/flang/test/Lower/Intrinsics/c_f_pointer.f90 @@ -153,7 +153,6 @@ subroutine dynamic_shape_lower(cptr, fpr, shape, lower) ! CHECK: %[[VAL_2:.*]] = fir.shape %[[C_0]], %[[C_0]] : (index, index) -> !fir.shape<2> ! CHECK: %[[VAL_3:.*]] = fir.embox %[[VAL_1:.*]](%[[VAL_2]]) : (!fir.ptr<!fir.array<?x?xf32>>, !fir.shape<2>) -> !fir.box<!fir.ptr<!fir.array<?x?xf32>>> ! CHECK: fir.store %[[VAL_3]] to %[[VAL_0:.*]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> -! CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFdynamic_shape_lowerEn"} ! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[ARG_0:.*]], __address : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> !fir.ref<i64> ! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref<i64> !
CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> !fir.ptr<!fir.array<?x?xf32>> diff --git a/flang/test/Lower/Intrinsics/system_clock.f90 b/flang/test/Lower/Intrinsics/system_clock.f90 index 9eae3a58884fa..f6fae1113b315 100644 --- a/flang/test/Lower/Intrinsics/system_clock.f90 +++ b/flang/test/Lower/Intrinsics/system_clock.f90 @@ -32,11 +32,9 @@ subroutine system_clock_test() ! CHECK-LABEL: @_QPss subroutine ss(count) - ! CHECK: %[[V_0:[0-9]+]] = fir.alloca !fir.box<!fir.heap<i64>> {bindc_name = "count_max", uniq_name = "_QFssEcount_max"} ! CHECK: %[[V_1:[0-9]+]] = fir.alloca !fir.heap<i64> {uniq_name = "_QFssEcount_max.addr"} ! CHECK: %[[V_2:[0-9]+]] = fir.zero_bits !fir.heap<i64> ! CHECK: fir.store %[[V_2]] to %[[V_1]] : !fir.ref<!fir.heap<i64>> - ! CHECK: %[[V_3:[0-9]+]] = fir.alloca !fir.box<!fir.ptr<i64>> {bindc_name = "count_rate", uniq_name = "_QFssEcount_rate"} ! CHECK: %[[V_4:[0-9]+]] = fir.alloca !fir.ptr<i64> {uniq_name = "_QFssEcount_rate.addr"} ! CHECK: %[[V_5:[0-9]+]] = fir.zero_bits !fir.ptr<i64> ! CHECK: fir.store %[[V_5]] to %[[V_4]] : !fir.ref<!fir.ptr<i64>> diff --git a/flang/test/Lower/OpenMP/Todo/defaultmap-clause-firstprivate.f90 b/flang/test/Lower/OpenMP/Todo/defaultmap-clause-firstprivate.f90 index 6818c39f63a3c..1e0d9694258cc 100644 --- a/flang/test/Lower/OpenMP/Todo/defaultmap-clause-firstprivate.f90 +++ b/flang/test/Lower/OpenMP/Todo/defaultmap-clause-firstprivate.f90 @@ -6,7 +6,7 @@ subroutine f00 ! NOTE: This is implemented for scalars as it is the default behaviour, so we utilise ! a different data type. integer, allocatable :: i - !CHECK: not yet implemented: Firstprivate and None are currently unsupported defaultmap behaviour + !CHECK: not yet implemented: Firstprivate is currently unsupported defaultmap behaviour !$omp target defaultmap(firstprivate) i = 10 !$omp end target diff --git a/flang/test/Lower/OpenMP/Todo/defaultmap-clause-none.f90 b/flang/test/Lower/OpenMP/Todo/defaultmap-clause-none.f90 deleted file mode 100644 index 287eb4a9dfe8f..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/defaultmap-clause-none.f90 +++ /dev/null @@ -1,11 +0,0 @@ -!RUN: %not_todo_cmd bbc -emit-hlfir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s -!RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s - -subroutine f00 - implicit none - integer :: i - !CHECK: not yet implemented: Firstprivate and None are currently unsupported defaultmap behaviour - !$omp target defaultmap(none) - i = 10 - !$omp end target -end diff --git a/flang/test/Lower/OpenMP/Todo/taskloop-collapse.f90 b/flang/test/Lower/OpenMP/Todo/taskloop-collapse.f90 new file mode 100644 index 0000000000000..cd54f5eeba6c4 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/taskloop-collapse.f90 @@ -0,0 +1,15 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s + +! CHECK: not yet implemented: Unhandled clause COLLAPSE in TASKLOOP construct +subroutine omp_taskloop_collapse() + integer x + x = 0 + !$omp taskloop collapse(2) + do i = 1, 100 + do j = 1, 100 + x = x + 1 + end do + end do + !$omp end taskloop +end subroutine omp_taskloop_collapse diff --git a/flang/test/Lower/OpenMP/Todo/taskloop-lastprivate.f90 b/flang/test/Lower/OpenMP/Todo/taskloop-lastprivate.f90 new file mode 100644 index 0000000000000..54f2580daf283 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/taskloop-lastprivate.f90 @@ -0,0 +1,13 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s +!
RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s + +! CHECK: not yet implemented: Unhandled clause LASTPRIVATE in TASKLOOP construct +subroutine omp_taskloop_lastprivate() + integer x + x = 0 + !$omp taskloop lastprivate(x) + do i = 1, 100 + x = x + 1 + end do + !$omp end taskloop +end subroutine omp_taskloop_lastprivate diff --git a/flang/test/Lower/OpenMP/Todo/taskloop-nogroup.f90 b/flang/test/Lower/OpenMP/Todo/taskloop-nogroup.f90 new file mode 100644 index 0000000000000..2a0c5985290e2 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/taskloop-nogroup.f90 @@ -0,0 +1,13 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s + +! CHECK: not yet implemented: Unhandled clause NOGROUP in TASKLOOP construct +subroutine omp_taskloop_nogroup() + integer x + x = 0 + !$omp taskloop nogroup + do i = 1, 100 + x = x + 1 + end do + !$omp end taskloop +end subroutine omp_taskloop_nogroup diff --git a/flang/test/Lower/OpenMP/if-clause.f90 b/flang/test/Lower/OpenMP/if-clause.f90 index 9e8a41d0a5f69..e8a8670381d32 100644 --- a/flang/test/Lower/OpenMP/if-clause.f90 +++ b/flang/test/Lower/OpenMP/if-clause.f90 @@ -11,7 +11,7 @@ program main ! TODO When they are supported, add tests for: ! - PARALLEL SECTIONS ! - PARALLEL WORKSHARE - ! - TASKLOOP + ! - TARGET UPDATE ! - TASKLOOP SIMD ! ---------------------------------------------------------------------------- @@ -1595,4 +1595,29 @@ program main !$omp teams if(teams: .true.) i = 1 !$omp end teams + + ! ---------------------------------------------------------------------------- + ! TASKLOOP + ! ---------------------------------------------------------------------------- + + ! CHECK: omp.taskloop + ! CHECK-NOT: if({{.*}}) + !$omp taskloop + do i = 1, 10 + end do + !$omp end taskloop + + ! CHECK: omp.taskloop + ! CHECK-SAME: if({{.*}}) + !$omp taskloop if(.true.) + do i = 1, 10 + end do + !$omp end taskloop + + ! CHECK: omp.taskloop + ! CHECK-SAME: if({{.*}}) + !$omp taskloop if(taskloop: .true.) + do i = 1, 10 + end do + !$omp end taskloop end program main diff --git a/flang/test/Lower/OpenMP/implicit-dsa.f90 b/flang/test/Lower/OpenMP/implicit-dsa.f90 index 0d2db63edfe79..9d01460253899 100644 --- a/flang/test/Lower/OpenMP/implicit-dsa.f90 +++ b/flang/test/Lower/OpenMP/implicit-dsa.f90 @@ -5,6 +5,36 @@ ! Privatizers +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TASKLOOP_TEST3_I_PRIVATE:.*]] : i32 +! CHECK-NOT: copy { + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = firstprivate} @[[TASKLOOP_TEST3_X_FIRSTPRIVATE:.*]] : i32 +! CHECK-SAME: copy { +! CHECK: hlfir.assign + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TASKLOOP_TEST2_X_PRIVATE:.*]] : i32 +! CHECK-NOT: copy { + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TASKLOOP_TEST2_I_PRIVATE:.*]] : i32 +! CHECK-NOT: copy { + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TASKLOOP_TEST1_I_PRIVATE:.*]] : i32 +! CHECK-NOT: copy { + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = firstprivate} @[[TASKLOOP_TEST1_X_FIRSTPRIVATE:.*]] : i32 +! CHECK-SAME: copy { +! CHECK: hlfir.assign + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TASKLOOP_TEST1_Y_PRIVATE:.*]] : i32 +! CHECK-NOT: copy { + ! CHECK-LABEL: omp.private ! CHECK-SAME: {type = firstprivate} @[[TEST7_Y_FIRSTPRIV:.*]] : i32 ! CHECK-SAME: copy { @@ -310,4 +340,100 @@ subroutine implicit_dsa_test7 !$omp end task end subroutine -! 
TODO Test taskloop +! Test taskloop +! CHECK-LABEL: func.func @_QPimplicit_dsa_taskloop_test1 +! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFimplicit_dsa_taskloop_test1Ei"} +! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[ALLOCA_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_taskloop_test1Ex"} +! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ALLOCA_X]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[ALLOCA_Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFimplicit_dsa_taskloop_test1Ey"} +! CHECK: %[[DECL_Y:.*]]:2 = hlfir.declare %[[ALLOCA_Y]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[ALLOCA_Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFimplicit_dsa_taskloop_test1Ez"} +! CHECK: %[[DECL_Z:.*]]:2 = hlfir.declare %[[ALLOCA_Z]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +subroutine implicit_dsa_taskloop_test1 + integer :: x, y, z + ! CHECK: omp.taskloop private( + ! CHECK-SAME: @[[TASKLOOP_TEST1_Y_PRIVATE]] %[[DECL_Y]]#0 -> %[[ARG0:.*]], @[[TASKLOOP_TEST1_X_FIRSTPRIVATE]] %[[DECL_X]]#0 -> %[[ARG1:.*]], @[[TASKLOOP_TEST1_I_PRIVATE]] %[[DECL_I]]#0 -> %[[ARG2:.*]] : !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>) { + ! CHECK: omp.loop_nest (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) inclusive step (%{{.*}}) { + !$omp taskloop private(y) shared(z) + do i = 1, 100 + ! CHECK: %[[Y_VAL:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + ! CHECK: %[[X_VAL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFimplicit_dsa_taskloop_test1Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + ! CHECK: %[[LOAD_Z:.*]] = fir.load %[[DECL_Z]]#0 : !fir.ref<i32> + x = y + z + ! CHECK: hlfir.assign %{{.*}} to %[[X_VAL]]#0 : i32, !fir.ref<i32> + end do + !$omp end taskloop + + ! CHECK: omp.taskloop private(@[[TASKLOOP_TEST1_I_PRIVATE]] %[[DECL_I]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) { + !$omp taskloop default(shared) + do i = 1, 100 + ! CHECK: %[[LOAD_Y:.*]] = fir.load %[[DECL_Y]]#0 : !fir.ref<i32> + ! CHECK: %[[LOAD_Z:.*]] = fir.load %[[DECL_Z]]#0 : !fir.ref<i32> + ! CHECK: %[[ADD_VAL:.*]] = arith.addi %[[LOAD_Y]], %[[LOAD_Z]] : i32 + x = y + z + ! CHECK: hlfir.assign %[[ADD_VAL]] to %[[DECL_X]]#0 : i32, !fir.ref<i32> + end do + !$omp end taskloop +end subroutine + +! Nested taskloop with implicit shared DSA variables. +! CHECK-LABEL: func @_QPimplicit_dsa_taskloop_test2 +! CHECK: %[[I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFimplicit_dsa_taskloop_test2Ei"} +! CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I]] {uniq_name = "_QFimplicit_dsa_taskloop_test2Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_taskloop_test2Ex"} +! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_taskloop_test2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +subroutine implicit_dsa_taskloop_test2 + integer :: x + ! CHECK: omp.parallel { + !$omp parallel + ! CHECK: omp.taskloop private(@[[TASKLOOP_TEST2_I_PRIVATE]] %[[I_DECL]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) { + !$omp taskloop + do i = 1, 100 + ! CHECK: hlfir.assign %{{.*}} to %[[X_DECL]]#0 : i32, !fir.ref<i32> + x = 2 + end do + !$omp end taskloop + + !
CHECK: omp.taskloop private(@[[TASKLOOP_TEST2_X_PRIVATE]] %[[X_DECL]]#0 -> %[[ARG0]], @[[TASKLOOP_TEST2_I_PRIVATE]] %[[I_DECL]]#0 -> %[[ARG1:.*]] : !fir.ref<i32>, !fir.ref<i32>) { + !$omp taskloop private(x) + do i = 1, 10 + ! CHECK: %[[DECL_PRIV_X:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFimplicit_dsa_taskloop_test2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + ! CHECK: %[[LOAD_X:.*]] = fir.load %[[DECL_PRIV_X]]#0 : !fir.ref<i32> + x = x + 1 + ! CHECK: hlfir.assign %{{.*}} to %[[DECL_PRIV_X]]#0 : i32, !fir.ref<i32> + end do + !$omp end parallel + +end subroutine + +! Taskloop with implicit firstprivate DSA variables, enclosed in private context. + +! CHECK-LABEL: func @_QPimplicit_dsa_taskloop_test3 +! CHECK: %[[I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFimplicit_dsa_taskloop_test3Ei"} +! CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_taskloop_test3Ex"} +! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFimplicit_dsa_taskloop_test3Ey"} +! CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFimplicit_dsa_taskloop_test3Ez"} +! CHECK: %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ez"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + +subroutine implicit_dsa_taskloop_test3 + integer :: x, y, z + ! CHECK: omp.parallel private(@[[TASKLOOP_TEST3_X_FIRSTPRIVATE]] %[[X_DECL]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) { + ! CHECK: %[[X_PRIV_VAL:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + !$omp parallel firstprivate(x) + ! CHECK: omp.taskloop private(@[[TASKLOOP_TEST3_X_FIRSTPRIVATE]] %[[X_PRIV_VAL]]#0 -> %[[ARG1:.*]], @[[TASKLOOP_TEST3_I_PRIVATE]] %[[I_DECL]]#0 -> %[[ARG2:.*]] : !fir.ref<i32>, !fir.ref<i32>) { + !$omp taskloop + ! CHECK: %[[X_VAL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFimplicit_dsa_taskloop_test3Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + do i = 1, 100 + ! CHECK: %[[LOAD_Y:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<i32> + ! CHECK: %[[LOAD_Z:.*]] = fir.load %[[Z_DECL]]#0 : !fir.ref<i32> + x = y + z + ! CHECK: hlfir.assign %{{.*}} to %[[X_VAL]]#0 : i32, !fir.ref<i32> + end do + !$omp end taskloop + !$omp end parallel +end subroutine + diff --git a/flang/test/Lower/OpenMP/taskloop.f90 b/flang/test/Lower/OpenMP/taskloop.f90 index 79b0c20e176c0..4a06e4def0c83 100644 --- a/flang/test/Lower/OpenMP/taskloop.f90 +++ b/flang/test/Lower/OpenMP/taskloop.f90 @@ -1,5 +1,27 @@ -! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s -! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s +! REQUIRES: openmp_runtime +! RUN: bbc -emit-hlfir %openmp_flags -o - %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir %openmp_flags -o - %s 2>&1 | FileCheck %s + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[OMP_TASKLOOP_UNTIEDEI_PRIVATE_I32:.*]] : i32 + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[QFTEST_PRIORITYEI_PRIVATE_I32:.*]] : i32 + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[QFTEST_MERGEABLEEI_PRIVATE_I32:.*]] : i32 + +!
CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[I_PRIVATE_IF_TEST1:.*]] : i32 + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[I_PRIVATE_FINAL:.*]] : i32 + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[I_PRIVATE_TEST_ALLOCATE:.*]] : i32 + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[X_PRIVATE_TEST_ALLOCATE:.*]] : i32 ! CHECK-LABEL: omp.private ! CHECK-SAME: {type = private} @[[I_PRIVATE_TEST2:.*]] : i32 @@ -70,3 +92,106 @@ subroutine omp_taskloop_private ! CHECK: } !$omp end taskloop end subroutine omp_taskloop_private + +!=============================================================================== +! `allocate` clause +!=============================================================================== + +! CHECK-LABEL: func.func @_QPtaskloop_allocate +! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtaskloop_allocateEi"} +! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFtaskloop_allocateEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[ALLOCA_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFtaskloop_allocateEx"} +! CHECK: %[[DECL_X:.*]]:2 = hlfir.declare %[[ALLOCA_X]] {uniq_name = "_QFtaskloop_allocateEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +subroutine taskloop_allocate() + use omp_lib + integer :: x + ! CHECK: omp.taskloop allocate(%{{.*}} : i64 -> %[[DECL_X]]#0 : !fir.ref<i32>) + ! CHECK-SAME: private(@[[X_PRIVATE_TEST_ALLOCATE]] %[[DECL_X]]#0 -> %[[ARG0:.*]], @[[I_PRIVATE_TEST_ALLOCATE]] %[[DECL_I]]#0 -> %[[ARG1:.*]] : !fir.ref<i32>, !fir.ref<i32>) { + !$omp taskloop allocate(omp_high_bw_mem_alloc: x) private(x) + do i = 1, 100 + ! CHECK: arith.addi + x = x + 12 + ! CHECK: omp.yield + end do + !$omp end taskloop +end subroutine taskloop_allocate + +!=============================================================================== +! `final` clause +!=============================================================================== + +! CHECK-LABEL: func.func @_QPtaskloop_final +! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtaskloop_finalEi"} +! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFtaskloop_finalEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +subroutine taskloop_final() + ! CHECK: omp.taskloop final(%true) private(@[[I_PRIVATE_FINAL]] %[[DECL_I]]#0 -> %[[ARG0:.*]] : !fir.ref<i32>) { + !$omp taskloop final(.true.) + do i = 1, 100 + ! CHECK: fir.call @_QPfoo() + call foo() + end do + !$omp end taskloop +end subroutine + +!=============================================================================== +! `if` clause +!=============================================================================== + +! CHECK-LABEL: func.func @_QPomp_taskloop_if +! CHECK: %[[DECL_BAR:.*]]:2 = hlfir.declare %[[ARG0:.*]] dummy_scope %{{.*}} +! CHECK: %[[ALLOCA_I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_taskloop_ifEi"} +! CHECK: %[[DECL_I:.*]]:2 = hlfir.declare %[[ALLOCA_I]] {uniq_name = "_QFomp_taskloop_ifEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) +! CHECK: %[[LOAD_VAL:.*]] = fir.load %[[DECL_BAR]]#0 : !fir.ref<!fir.logical<4>> +!
CHECK: %[[VAL_BAR:.*]] = fir.convert %[[LOAD_VAL]] : (!fir.logical<4>) -> i1 +subroutine omp_taskloop_if(bar) + logical, intent(inout) :: bar + !CHECK: omp.taskloop if(%[[VAL_BAR]]) private(@[[I_PRIVATE_IF_TEST1]] %[[DECL_I]]#0 -> %[[ARG1:.*]] : !fir.ref<i32>) { + !$omp taskloop if(bar) + do i = 1, 10 + call foo() + end do + !$omp end taskloop +end subroutine omp_taskloop_if + +!=============================================================================== +! `mergeable` clause +!=============================================================================== + +! CHECK-LABEL: func.func @_QPtest_mergeable +subroutine test_mergeable + ! CHECK: omp.taskloop mergeable + !$omp taskloop mergeable + do i = 1, 10 + end do + !$omp end taskloop +end subroutine test_mergeable + +!=============================================================================== +! `priority` clause +!=============================================================================== + +! CHECK-LABEL: func.func @_QPtest_priority +! CHECK: %[[VAL1:.*]]:2 = hlfir.declare %[[ARG0:.*]] dummy_scope %{{.*}} +! CHECK: %[[LOAD_VAL:.*]] = fir.load %[[VAL1]]#0 : !fir.ref<i32> +subroutine test_priority(n) + integer, intent(inout) :: n + ! CHECK: omp.taskloop priority(%[[LOAD_VAL]] : i32) + !$omp taskloop priority(n) + do i = 1, 10 + end do + !$omp end taskloop +end subroutine test_priority + +!=============================================================================== +! `untied` clause +!=============================================================================== + +! CHECK-LABEL: func.func @_QPomp_taskloop_untied +subroutine omp_taskloop_untied() + ! CHECK: omp.taskloop untied + !$omp taskloop untied + do i = 1, 10 + call foo() + end do + !$omp end taskloop +end subroutine diff --git a/flang/test/Lower/allocatables.f90 b/flang/test/Lower/allocatables.f90 index e62f92fa0c1c7..60b7de3301c48 100644 --- a/flang/test/Lower/allocatables.f90 +++ b/flang/test/Lower/allocatables.f90 @@ -56,7 +56,7 @@ subroutine foodim1() ! CHECK-DAG: fir.load %[[xAddrVar]] : !fir.ref<!fir.heap<!fir.array<?xf32>>> deallocate(x) - ! CHECK: %[[xAddr1:.*]] = fir.load %1 : !fir.ref<!fir.heap<!fir.array<?xf32>>> + ! CHECK: %[[xAddr1:.*]] = fir.load %{{.*}} : !fir.ref<!fir.heap<!fir.array<?xf32>>> ! CHECK: fir.freemem %[[xAddr1]] ! CHECK: %[[nullAddr1:.*]] = fir.zero_bits !fir.heap<!fir.array<?xf32>> ! CHECK: fir.store %[[nullAddr1]] to %[[xAddrVar]] : !fir.ref<!fir.heap<!fir.array<?xf32>>> @@ -67,10 +67,6 @@ subroutine foodim2() ! Test lowering of local allocatable specification real, allocatable :: x(:, :) ! CHECK-DAG: fir.alloca !fir.heap<!fir.array<?x?xf32>> {{{.*}}uniq_name = "_QFfoodim2Ex.addr"} - ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.lb0"} - ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.ext0"} - ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.lb1"} - ! CHECK-DAG: fir.alloca index {{{.*}}uniq_name = "_QFfoodim2Ex.ext1"} end subroutine ! test lowering of character allocatables. Focus is placed on the length handling diff --git a/flang/test/Lower/character-local-variables.f90 b/flang/test/Lower/character-local-variables.f90 index d5b959eca1ff6..6325229993a25 100644 --- a/flang/test/Lower/character-local-variables.f90 +++ b/flang/test/Lower/character-local-variables.f90 @@ -8,6 +8,7 @@ subroutine scalar_cst_len() character(10) :: c ! CHECK: fir.alloca !fir.char<1,10> {{{.*}}uniq_name = "_QFscalar_cst_lenEc"} + print *, c end subroutine ! CHECK-LABEL: func @_QPscalar_dyn_len @@ -19,12 +20,14 @@ subroutine scalar_dyn_len(l) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[lexpr]], %c0{{.*}} : i32 !
CHECK: %[[l:.*]] = arith.select %[[is_positive]], %[[lexpr]], %c0{{.*}} : i32 ! CHECK: fir.alloca !fir.char<1,?>(%[[l]] : i32) {{{.*}}uniq_name = "_QFscalar_dyn_lenEc"} + print *, c end subroutine ! CHECK-LABEL: func @_QPcst_array_cst_len subroutine cst_array_cst_len() character(10) :: c(20) ! CHECK: fir.alloca !fir.array<20x!fir.char<1,10>> {{{.*}}uniq_name = "_QFcst_array_cst_lenEc"} + print *, c(1) end subroutine ! CHECK-LABEL: func @_QPcst_array_dyn_len @@ -36,6 +39,7 @@ subroutine cst_array_dyn_len(l) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[lexpr]], %c0{{.*}} : i32 ! CHECK: %[[l:.*]] = arith.select %[[is_positive]], %[[lexpr]], %c0{{.*}} : i32 ! CHECK: fir.alloca !fir.array<10x!fir.char<1,?>>(%[[l]] : i32) {{{.*}}uniq_name = "_QFcst_array_dyn_lenEc"} + print *, c(1) end subroutine ! CHECK-LABEL: func @_QPdyn_array_cst_len @@ -48,6 +52,7 @@ subroutine dyn_array_cst_len(n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[ni]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[ni]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array<?x!fir.char<1,10>>, %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_cst_lenEc"} + print *, c(1) end subroutine ! CHECK: func @_QPdyn_array_dyn_len @@ -63,12 +68,14 @@ subroutine dyn_array_dyn_len(l, n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[ni]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[ni]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array<?x!fir.char<1,?>>(%[[l]] : i32), %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_dyn_lenEc"} + print *, c(1) end subroutine ! CHECK-LABEL: func @_QPcst_array_cst_len_lb subroutine cst_array_cst_len_lb() character(10) :: c(11:30) ! CHECK: fir.alloca !fir.array<20x!fir.char<1,10>> {{{.*}}uniq_name = "_QFcst_array_cst_len_lbEc"} + print *, c(11) end subroutine ! CHECK-LABEL: func @_QPcst_array_dyn_len_lb @@ -80,6 +87,7 @@ subroutine cst_array_dyn_len_lb(l) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[lexpr]], %c0{{.*}} : i64 ! CHECK: %[[l:.*]] = arith.select %[[is_positive]], %[[lexpr]], %c0{{.*}} : i64 ! CHECK: fir.alloca !fir.array<10x!fir.char<1,?>>(%[[l]] : i64) {{{.*}}uniq_name = "_QFcst_array_dyn_len_lbEc"} + print *, c(11) end subroutine ! CHECK-LABEL: func @_QPdyn_array_cst_len_lb @@ -94,6 +102,7 @@ subroutine dyn_array_cst_len_lb(n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[raw_extent]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[raw_extent]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array<?x!fir.char<1,10>>, %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_cst_len_lbEc"} + print *, c(11) end subroutine ! CHECK-LABEL: func @_QPdyn_array_dyn_len_lb @@ -111,6 +120,7 @@ subroutine dyn_array_dyn_len_lb(l, n) ! CHECK: %[[is_positive:.*]] = arith.cmpi sgt, %[[raw_extent]], %c0{{.*}} : index ! CHECK: %[[extent:.*]] = arith.select %[[is_positive]], %[[raw_extent]], %c0{{.*}} : index ! CHECK: fir.alloca !fir.array<?x!fir.char<1,?>>(%[[l]] : i64), %[[extent]] {{{.*}}uniq_name = "_QFdyn_array_dyn_len_lbEc"} + print *, c(11) end subroutine ! Test that the length of assumed length parameter is correctly deduced in lowering. @@ -129,4 +139,5 @@ subroutine assumed_length_param(n) subroutine scalar_cst_neg_len() character(-1) :: c !
CHECK: fir.alloca !fir.char<1,0> {{{.*}}uniq_name = "_QFscalar_cst_neg_lenEc"} + print *, c end subroutine diff --git a/flang/test/Lower/derived-types.f90 b/flang/test/Lower/derived-types.f90 index 4d36a7632b070..7e36ec0cfe93f 100644 --- a/flang/test/Lower/derived-types.f90 +++ b/flang/test/Lower/derived-types.f90 @@ -35,6 +35,8 @@ subroutine local_derived() ! CHECK-DAG: fir.alloca !fir.type<_QMdTr{x:f32}> type(r) :: some_r type(c2) :: some_c2 + print *, some_c2%ch_array(1,1) + print *, some_r%x end subroutine ! CHECK-LABEL: func @_QMdPsaved_derived( diff --git a/flang/test/Lower/do_loop_unstructured.f90 b/flang/test/Lower/do_loop_unstructured.f90 index 3b03850b43bb2..9c7d874a1aac8 100644 --- a/flang/test/Lower/do_loop_unstructured.f90 +++ b/flang/test/Lower/do_loop_unstructured.f90 @@ -235,6 +235,7 @@ subroutine nested_structured_in_unstructured() subroutine unstructured_do_concurrent logical :: success do concurrent (i=1:10) local(success) + success = .false. error stop "fail" enddo end diff --git a/flang/test/Lower/forall/array-pointer.f90 b/flang/test/Lower/forall/array-pointer.f90 index fd3efed736c39..6b8c5648af29e 100644 --- a/flang/test/Lower/forall/array-pointer.f90 +++ b/flang/test/Lower/forall/array-pointer.f90 @@ -318,7 +318,6 @@ end subroutine s2_3 ! CHECK-LABEL: func @_QPs2_3( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box>}>>> {fir.bindc_name = "x"}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {adapt.valuebyref, bindc_name = "i"} -! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "y", fir.target, uniq_name = "_QFs2_3Ey"} ! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.heap<!fir.array<?xi32>> {uniq_name = "_QFs2_3Ey.addr"} ! CHECK: %[[VAL_4:.*]] = fir.alloca index {uniq_name = "_QFs2_3Ey.lb0"} ! CHECK: %[[VAL_5:.*]] = fir.alloca index {uniq_name = "_QFs2_3Ey.ext0"} diff --git a/flang/test/Lower/forall/forall-allocatable.f90 b/flang/test/Lower/forall/forall-allocatable.f90 index 96cd37ea3ed8a..8e54d282aea4b 100644 --- a/flang/test/Lower/forall/forall-allocatable.f90 +++ b/flang/test/Lower/forall/forall-allocatable.f90 @@ -13,20 +13,19 @@ end subroutine forall_with_allocatable ! CHECK-LABEL: func @_QPforall_with_allocatable( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>>{{.*}}) { ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {adapt.valuebyref, bindc_name = "i"} -! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "arr", uniq_name = "_QFforall_with_allocatableEarr"} -! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.heap<!fir.array<?xf32>> {uniq_name = "_QFforall_with_allocatableEarr.addr"} -! CHECK: %[[VAL_4:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.lb0"} -! CHECK: %[[VAL_5:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.ext0"} -! CHECK: %[[VAL_6:.*]] = fir.zero_bits !fir.heap<!fir.array<?xf32>> -! CHECK: fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<!fir.heap<!fir.array<?xf32>>> +! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.heap<!fir.array<?xf32>> {uniq_name = "_QFforall_with_allocatableEarr.addr"} +! CHECK: %[[VAL_3:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.lb0"} +! CHECK: %[[VAL_4:.*]] = fir.alloca index {uniq_name = "_QFforall_with_allocatableEarr.ext0"} +! CHECK: %[[VAL_5:.*]] = fir.zero_bits !fir.heap<!fir.array<?xf32>> +! CHECK: fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<!fir.heap<!fir.array<?xf32>>> ! CHECK: %[[VAL_7:.*]] = arith.constant 5 : i32 ! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i32) -> index ! CHECK: %[[VAL_9:.*]] = arith.constant 15 : i32 ! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> index ! CHECK: %[[VAL_11:.*]] = arith.constant 1 : index -! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_4]] : !fir.ref<index> -!
CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_5]] : !fir.ref -! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_3]] : !fir.ref>> +! CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_3]] : !fir.ref +! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_4]] : !fir.ref +! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_2]] : !fir.ref>> ! CHECK: %[[VAL_15:.*]] = fir.shape_shift %[[VAL_12]], %[[VAL_13]] : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[VAL_16:.*]] = fir.array_load %[[VAL_14]](%[[VAL_15]]) : (!fir.heap>, !fir.shapeshift<1>) -> !fir.array ! CHECK: %[[VAL_17:.*]] = fir.array_load %[[VAL_0]] : (!fir.box>) -> !fir.array diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90 index 2fea84b03891a..5ee6562733dae 100644 --- a/flang/test/Lower/loops.f90 +++ b/flang/test/Lower/loops.f90 @@ -90,7 +90,6 @@ subroutine lis(n) ! CHECK-DAG: fir.alloca !fir.array, %{{.*}}, %{{.*}}, %{{.*}} {bindc_name = "a", fir.target, uniq_name = "_QFlisEa"} ! CHECK-DAG: fir.alloca !fir.array, %{{.*}}, %{{.*}} {bindc_name = "r", uniq_name = "_QFlisEr"} ! CHECK-DAG: fir.alloca !fir.array, %{{.*}}, %{{.*}} {bindc_name = "s", uniq_name = "_QFlisEs"} - ! CHECK-DAG: fir.alloca !fir.array, %{{.*}}, %{{.*}} {bindc_name = "t", uniq_name = "_QFlisEt"} integer, target :: a(n,n,n) ! operand via p integer :: r(n,n) ! result, unspecified locality integer :: s(n,n) ! shared locality diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90 index f586380e653a0..bc4eed54282df 100644 --- a/flang/test/Lower/polymorphic.f90 +++ b/flang/test/Lower/polymorphic.f90 @@ -287,7 +287,6 @@ subroutine pointer_assign_parent(p) ! First test is here to have a reference with non polymorphic on both sides. ! CHECK-LABEL: func.func @_QMpolymorphic_testPpointer_assign_parent( ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref> {fir.bindc_name = "p", fir.target}) { -! CHECK: %[[TP:.*]] = fir.alloca !fir.box>> {bindc_name = "tp", uniq_name = "_QMpolymorphic_testFpointer_assign_parentEtp"} ! CHECK: %[[PTR:.*]] = fir.alloca !fir.ptr> {uniq_name = "_QMpolymorphic_testFpointer_assign_parentEtp.addr"} ! CHECK: %[[ZERO:.*]] = fir.zero_bits !fir.ptr> ! CHECK: fir.store %[[ZERO]] to %[[PTR]] : !fir.ref>> @@ -302,7 +301,6 @@ subroutine pointer_assign_non_poly(p) ! CHECK-LABEL: func.func @_QMpolymorphic_testPpointer_assign_non_poly( ! CHECK-SAME: %arg0: !fir.class> {fir.bindc_name = "p", fir.target}) { -! CHECK: %[[TP:.*]] = fir.alloca !fir.box>> {bindc_name = "tp", uniq_name = "_QMpolymorphic_testFpointer_assign_non_polyEtp"} ! CHECK: %[[PTR:.*]] = fir.alloca !fir.ptr> {uniq_name = "_QMpolymorphic_testFpointer_assign_non_polyEtp.addr"} ! CHECK: %[[ZERO:.*]] = fir.zero_bits !fir.ptr> ! CHECK: fir.store %[[ZERO]] to %[[PTR]] : !fir.ref>> @@ -1103,11 +1101,9 @@ subroutine class_with_entry(a) ! CHECK-LABEL: func.func @_QMpolymorphic_testPclass_with_entry( ! CHECK-SAME: %[[A:.*]]: !fir.class> {fir.bindc_name = "a"}) { -! CHECK: %[[B:.*]] = fir.alloca !fir.class> {bindc_name = "b", uniq_name = "_QMpolymorphic_testFclass_with_entryEb"} ! CHECK-LABEL: func.func @_QMpolymorphic_testPd( ! CHECK-SAME: %[[B:.*]]: !fir.class> {fir.bindc_name = "b"}) { -! 
CHECK: %[[A:.*]] = fir.alloca !fir.class> {bindc_name = "a", uniq_name = "_QMpolymorphic_testFclass_with_entryEa"} subroutine class_array_with_entry(a) class(p1) :: a(:), b(:) diff --git a/flang/test/Lower/statement-function.f90 b/flang/test/Lower/statement-function.f90 index cfec06c35baa8..fe07649e669af 100644 --- a/flang/test/Lower/statement-function.f90 +++ b/flang/test/Lower/statement-function.f90 @@ -129,7 +129,6 @@ integer function test_stmt_character_with_different_length_2(c, n) character(n) :: argc character(*) :: c ! CHECK: %[[unboxed:.*]]:2 = fir.unboxchar %[[arg0]] : - ! CHECK: fir.load %[[arg1]] : !fir.ref ! CHECK: %[[n:.*]] = fir.load %[[arg1]] : !fir.ref ! CHECK: %[[n_is_positive:.*]] = arith.cmpi sgt, %[[n]], %c0{{.*}} : i32 ! CHECK: %[[len:.*]] = arith.select %[[n_is_positive]], %[[n]], %c0{{.*}} : i32 diff --git a/flang/test/Semantics/OpenMP/defaultmap-clause-none.f90 b/flang/test/Semantics/OpenMP/defaultmap-clause-none.f90 new file mode 100644 index 0000000000000..08e8ebc995097 --- /dev/null +++ b/flang/test/Semantics/OpenMP/defaultmap-clause-none.f90 @@ -0,0 +1,96 @@ +! RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=51 + +subroutine defaultmap_all_none_no_errors + implicit none + real :: array(10) + integer, pointer :: ptr(:) + real, allocatable :: alloca + integer :: index + + !$omp target defaultmap(none) map(to: index, alloca) map(tofrom: array, ptr) + do index = 1, 10 + ptr(index) = array(index) + alloca + end do + !$omp end target +end subroutine defaultmap_all_none_no_errors + +subroutine defaultmap_all_none + implicit none + real :: array(10) + integer, pointer :: ptr(:) + real, allocatable :: alloca + integer :: index + !$omp target defaultmap(none) +!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + do index = 1, 10 +!ERROR: The DEFAULTMAP(NONE) clause requires that 'ptr' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause +!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause +!ERROR: The DEFAULTMAP(NONE) clause requires that 'array' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause +!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause +!ERROR: The DEFAULTMAP(NONE) clause requires that 'alloca' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + ptr(index) = array(index) + alloca + end do + !$omp end target +end subroutine defaultmap_all_none + +subroutine defaultmap_scalar_none + implicit none + real :: array(10) + integer, pointer :: ptr(:) + real, allocatable :: alloca + integer :: index + + !$omp target defaultmap(none: scalar) +!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + do index = 1, 10 +!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause +!ERROR: The DEFAULTMAP(NONE) clause requires that 'index' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + ptr(index) = array(index) + alloca + end do + !$omp end target +end subroutine defaultmap_scalar_none + +subroutine 
defaultmap_pointer_none + implicit none + real :: array(10) + integer, pointer :: ptr(:) + real, allocatable :: alloca + integer :: index + + !$omp target defaultmap(none: pointer) + do index = 1, 10 +!ERROR: The DEFAULTMAP(NONE) clause requires that 'ptr' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + ptr(index) = array(index) + alloca + end do + !$omp end target +end subroutine defaultmap_pointer_none + +subroutine defaultmap_allocatable_none + implicit none + real :: array(10) + integer, pointer :: ptr(:) + real, allocatable :: alloca + integer :: index + + !$omp target defaultmap(none: allocatable) + do index = 1, 10 +!ERROR: The DEFAULTMAP(NONE) clause requires that 'alloca' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + ptr(index) = array(index) + alloca + end do + !$omp end target +end subroutine defaultmap_allocatable_none + +subroutine defaultmap_aggregate_none + implicit none + real :: array(10) + integer, pointer :: ptr(:) + real, allocatable :: alloca + integer :: index + + !$omp target defaultmap(none: aggregate) + do index = 1, 10 +!ERROR: The DEFAULTMAP(NONE) clause requires that 'array' must be listed in a data-sharing attribute, data-mapping attribute, or is_device_ptr clause + ptr(index) = array(index) + alloca + end do + !$omp end target +end subroutine defaultmap_aggregate_none diff --git a/flang/test/Transforms/stack-arrays.fir b/flang/test/Transforms/stack-arrays.fir index 4a417ed981ab1..25fc73153003a 100644 --- a/flang/test/Transforms/stack-arrays.fir +++ b/flang/test/Transforms/stack-arrays.fir @@ -3,13 +3,17 @@ // Simplest transformation func.func @simple() { %0 = fir.allocmem !fir.array<42xi32> + %c0_s = arith.constant 0 : index + %c0_i32_s = arith.constant 0 : i32 + %ref_s = fir.convert %0 : (!fir.heap>) -> !fir.ref> + %elt_s = fir.coordinate_of %ref_s, %c0_s : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32_s to %elt_s : !fir.ref fir.freemem %0 : !fir.heap> return } -// CHECK: func.func @simple() { -// CHECK-NEXT: fir.alloca !fir.array<42xi32> -// CHECK-NEXT: return -// CHECK-NEXT: } +// CHECK: func.func @simple() +// CHECK: fir.alloca !fir.array<42xi32> +// CHECK: return // Check fir.must_be_heap allocations are not moved func.func @must_be_heap() { @@ -17,7 +21,7 @@ func.func @must_be_heap() { fir.freemem %0 : !fir.heap> return } -// CHECK: func.func @must_be_heap() { +// CHECK-LABEL: func.func @must_be_heap() // CHECK-NEXT: %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32> {fir.must_be_heap = true} // CHECK-NEXT: fir.freemem %[[ALLOC]] : !fir.heap> // CHECK-NEXT: return @@ -36,7 +40,7 @@ func.func @dfa1(%arg0: !fir.ref> {fir.bindc_name = "cond"}) { } return } -// CHECK: func.func @dfa1(%arg0: !fir.ref> {fir.bindc_name = "cond"}) { +// CHECK-LABEL: func.func @dfa1(%arg0: !fir.ref> {fir.bindc_name = "cond"}) // CHECK-NEXT: %[[C42:.*]] = arith.constant 42 : index // CHECK-NEXT: %[[MEM:.*]] = fir.allocmem !fir.array, %[[C42]] {uniq_name = "_QFdfa1Earr.alloc"} // CHECK-NEXT: %[[LOGICAL:.*]] = fir.load %arg0 : !fir.ref> @@ -57,7 +61,7 @@ func.func @dfa2(%arg0: i1) { } return } -// CHECK: func.func @dfa2(%arg0: i1) { +// CHECK-LABEL: func.func @dfa2(%arg0: i1) // CHECK-NEXT: %[[MEM:.*]] = fir.allocmem !fir.array<1xi8> // CHECK-NEXT: scf.if %arg0 { // CHECK-NEXT: fir.freemem %[[MEM]] : !fir.heap> @@ -74,15 +78,16 @@ func.func @dfa3(%arg0: i1) { } else { fir.freemem %a : !fir.heap> } + %c0_d3 = arith.constant 0 : index + %c0_i8_d3 = arith.constant 0 : i8 + %ref_d3 = 
fir.convert %a : (!fir.heap>) -> !fir.ref> + %elt_d3 = fir.coordinate_of %ref_d3, %c0_d3 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i8_d3 to %elt_d3 : !fir.ref return } -// CHECK: func.func @dfa3(%arg0: i1) { -// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<1xi8> -// CHECK-NEXT: fir.if %arg0 { -// CHECK-NEXT: } else { -// CHECK-NEXT: } -// CHECK-NEXT: return -// CHECK-NEXT: } +// CHECK: func.func @dfa3(%arg0: i1) +// CHECK: %[[MEM:.*]] = fir.alloca !fir.array<1xi8> +// CHECK: return func.func private @dfa3a_foo(!fir.ref>) -> () func.func private @dfa3a_bar(!fir.ref>) -> () @@ -101,7 +106,7 @@ func.func @dfa3a(%arg0: i1) { } return } -// CHECK: func.func @dfa3a(%arg0: i1) { +// CHECK-LABEL: func.func @dfa3a(%arg0: i1) // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<1xi8> // CHECK-NEXT: %[[HEAP:.*]] = fir.convert %[[MEM]] : (!fir.ref>) -> !fir.heap> // CHECK-NEXT: fir.if %arg0 { @@ -123,13 +128,18 @@ func.func @placement1() { // operand is now available %4 = fir.allocmem !fir.array, %3 // ... + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %ref1 = fir.convert %4 : (!fir.heap>) -> !fir.ref> + %elt1 = fir.coordinate_of %ref1, %c0 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32 to %elt1 : !fir.ref fir.freemem %4 : !fir.heap> return } -// CHECK: func.func @placement1() { +// CHECK-LABEL: func.func @placement1() // CHECK-NEXT: %[[ARG:.*]] = arith.constant 3 : index // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array, %[[ARG]] -// CHECK-NEXT: return +// CHECK: return // CHECK-NEXT: } // check that if there are no operands, then the alloca is placed early @@ -140,16 +150,21 @@ func.func @placement2() { %3 = arith.addi %1, %2 : index %4 = fir.allocmem !fir.array<42xi32> // ... + %c0_p2 = arith.constant 0 : index + %c0_i32_p2 = arith.constant 0 : i32 + %ref_p2 = fir.convert %4 : (!fir.heap>) -> !fir.ref> + %elt_p2 = fir.coordinate_of %ref_p2, %c0_p2 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32_p2 to %elt_p2 : !fir.ref fir.freemem %4 : !fir.heap> return } -// CHECK: func.func @placement2() { -// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<42xi32> -// CHECK-NEXT: %[[ONE:.*]] = arith.constant 1 : index -// CHECK-NEXT: %[[TWO:.*]] = arith.constant 2 : index -// CHECK-NEXT: %[[SUM:.*]] = arith.addi %[[ONE]], %[[TWO]] : index -// CHECK-NEXT: return -// CHECK-NEXT: } +// CHECK-LABEL: func.func @placement2() +// CHECK: %[[MEM:.*]] = fir.alloca !fir.array<42xi32> +// CHECK: %[[ONE:.*]] = arith.constant 1 : index +// CHECK: %[[TWO:.*]] = arith.constant 2 : index +// CHECK: %[[SUM:.*]] = arith.addi %[[ONE]], %[[TWO]] : index +// CHECK: return +// CHECK: } // check that stack allocations which must be placed in loops use stacksave func.func @placement3() { @@ -162,12 +177,17 @@ func.func @placement3() { // operand is now available %4 = fir.allocmem !fir.array, %3 // ... 
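// Note on the stores added throughout this file (assuming the pass keys on
// uses): each store gives its allocation a live use, so the promoted
// fir.alloca cannot simply be folded away as dead, and the tests keep
// exercising placement rather than deletion.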
+ %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %ref2 = fir.convert %4 : (!fir.heap>) -> !fir.ref> + %elt2 = fir.coordinate_of %ref2, %c0 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32 to %elt2 : !fir.ref fir.freemem %4 : !fir.heap> fir.result %3, %c1_i32 : index, i32 } return } -// CHECK: func.func @placement3() { +// CHECK-LABEL: func.func @placement3() // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index @@ -176,7 +196,7 @@ func.func @placement3() { // CHECK-NEXT: %[[SUM:.*]] = arith.addi %[[C1]], %[[C2]] : index // CHECK-NEXT: %[[SP:.*]] = llvm.intr.stacksave : !llvm.ptr // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array, %[[SUM]] -// CHECK-NEXT: llvm.intr.stackrestore %[[SP]] : !llvm.ptr +// CHECK: llvm.intr.stackrestore %[[SP]] : !llvm.ptr // CHECK-NEXT: fir.result // CHECK-NEXT: } // CHECK-NEXT: return @@ -194,12 +214,17 @@ func.func @placement4(%arg0 : i1) { // operand is now available %4 = fir.allocmem !fir.array, %3 // ... + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %ref3 = fir.convert %4 : (!fir.heap>) -> !fir.ref> + %elt3 = fir.coordinate_of %ref3, %c0 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32 to %elt3 : !fir.ref fir.freemem %4 : !fir.heap> cf.cond_br %arg0, ^bb1, ^bb2 ^bb2: return } -// CHECK: func.func @placement4(%arg0: i1) { +// CHECK-LABEL: func.func @placement4(%arg0: i1) // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C10:.*]] = arith.constant 10 : index @@ -208,7 +233,7 @@ func.func @placement4(%arg0 : i1) { // CHECK-NEXT: %[[C3:.*]] = arith.constant 3 : index // CHECK-NEXT: %[[SP:.*]] = llvm.intr.stacksave : !llvm.ptr // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array, %[[C3]] -// CHECK-NEXT: llvm.intr.stackrestore %[[SP]] : !llvm.ptr +// CHECK: llvm.intr.stackrestore %[[SP]] : !llvm.ptr // CHECK-NEXT: cf.cond_br %arg0, ^bb1, ^bb2 // CHECK-NEXT: ^bb2: // CHECK-NEXT: return @@ -230,7 +255,7 @@ func.func @placement5() { } return } -// CHECK: func.func @placement5() { +// CHECK-LABEL: func.func @placement5() // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index @@ -268,7 +293,7 @@ func.func @placement6(%arg0: i1) { fir.freemem %4 : !fir.heap> cf.br ^bb1 } -// CHECK: func.func @placement6(%arg0: i1) { +// CHECK-LABEL: func.func @placement6(%arg0: i1) // CHECK-NEXT: %[[c1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[c1_i32:.*]] = fir.convert %[[c1]] : (index) -> i32 // CHECK-NEXT: %[[c2:.*]] = arith.constant 2 : index @@ -289,6 +314,11 @@ func.func @placement6(%arg0: i1) { // Check multiple returns, where the memory is always freed func.func @returns(%arg0: i1) { %0 = fir.allocmem !fir.array<42xi32> + %c0_ret = arith.constant 0 : index + %c0_i32_ret = arith.constant 0 : i32 + %ref_ret = fir.convert %0 : (!fir.heap>) -> !fir.ref> + %elt_ret = fir.coordinate_of %ref_ret, %c0_ret : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32_ret to %elt_ret : !fir.ref cf.cond_br %arg0, ^bb1, ^bb2 ^bb1: fir.freemem %0 : !fir.heap> @@ -297,9 +327,9 @@ func.func @returns(%arg0: i1) { fir.freemem %0 : !fir.heap> return } -// CHECK: func.func @returns(%[[COND:.*]]: i1) { -// CHECK-NEXT: %[[ALLOC:.*]] = fir.alloca !fir.array<42xi32> -// CHECK-NEXT: cf.cond_br %[[COND]], ^bb1, ^bb2 +// 
CHECK-LABEL: func.func @returns( +// CHECK: %[[ALLOC:.*]] = fir.alloca !fir.array<42xi32> +// CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2 // CHECK-NEXT: ^bb1: // CHECK-NEXT: return // CHECK-NEXT: ^bb2: @@ -309,6 +339,11 @@ func.func @returns(%arg0: i1) { // Check multiple returns, where the memory is not freed on one branch func.func @returns2(%arg0: i1) { %0 = fir.allocmem !fir.array<42xi32> + %c0_ret2 = arith.constant 0 : index + %c0_i32_ret2 = arith.constant 0 : i32 + %ref_ret2 = fir.convert %0 : (!fir.heap>) -> !fir.ref> + %elt_ret2 = fir.coordinate_of %ref_ret2, %c0_ret2 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32_ret2 to %elt_ret2 : !fir.ref cf.cond_br %arg0, ^bb1, ^bb2 ^bb1: fir.freemem %0 : !fir.heap> @@ -316,9 +351,9 @@ func.func @returns2(%arg0: i1) { ^bb2: return } -// CHECK: func.func @returns2(%[[COND:.*]]: i1) { -// CHECK-NEXT: %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32> -// CHECK-NEXT: cf.cond_br %[[COND]], ^bb1, ^bb2 +// CHECK-LABEL: func.func @returns2( +// CHECK: %[[ALLOC:.*]] = fir.allocmem !fir.array<42xi32> +// CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2 // CHECK-NEXT: ^bb1: // CHECK-NEXT: fir.freemem %[[ALLOC]] : !fir.heap> // CHECK-NEXT: return @@ -338,7 +373,7 @@ func.func @omp_placement1() { } return } -// CHECK: func.func @omp_placement1() { +// CHECK-LABEL: func.func @omp_placement1() // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<42xi32> // CHECK-NEXT: %[[MEM_CONV:.*]] = fir.convert %[[MEM]] : (!fir.ref>) -> !fir.heap> // CHECK-NEXT: omp.sections { @@ -353,19 +388,21 @@ func.func @omp_placement1() { // function terminated by stop statement func.func @stop_terminator() { %0 = fir.allocmem !fir.array<42xi32> + %c0 = arith.constant 0 : index + %c0_i32_st = arith.constant 0 : i32 + %ref4 = fir.convert %0 : (!fir.heap>) -> !fir.ref> + %elt4 = fir.coordinate_of %ref4, %c0 : (!fir.ref>, index) -> !fir.ref + fir.store %c0_i32_st to %elt4 : !fir.ref fir.freemem %0 : !fir.heap> %c0_i32 = arith.constant 0 : i32 %false = arith.constant false fir.call @_FortranAStopStatement(%c0_i32, %false, %false) : (i32, i1, i1) -> () fir.unreachable } -// CHECK: func.func @stop_terminator() { -// CHECK-NEXT: fir.alloca !fir.array<42xi32> -// CHECK-NEXT: %[[ZERO:.*]] = arith.constant 0 : i32 -// CHECK-NEXT: %[[FALSE:.*]] = arith.constant false -// CHECK-NEXT: fir.call @_FortranAStopStatement(%[[ZERO]], %[[FALSE]], %[[FALSE]]) : (i32, i1, i1) -> () -// CHECK-NEXT: fir.unreachable -// CHECK-NEXT: } +// CHECK-LABEL: func.func @stop_terminator() +// CHECK: fir.alloca !fir.array<42xi32> +// CHECK: fir.call @_FortranAStopStatement( +// CHECK: fir.unreachable // check that stack allocations that use fir.declare which must be placed in loops @@ -387,7 +424,7 @@ func.func @placement_loop_declare() { } return } -// CHECK: func.func @placement_loop_declare() { +// CHECK-LABEL: func.func @placement_loop_declare() // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 // CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index @@ -415,7 +452,7 @@ func.func @lookthrough() { fir.freemem %4 : !fir.heap> return } -// CHECK: func.func @lookthrough() { +// CHECK-LABEL: func.func @lookthrough() // CHECK: fir.alloca !fir.array<42xi32> // CHECK-NOT: fir.freemem @@ -457,6 +494,6 @@ func.func @finding_freemem_in_block() { ^bb3: // pred: ^bb1 return } -// CHECK: func.func @finding_freemem_in_block() { +// CHECK-LABEL: func.func @finding_freemem_in_block() // CHECK: fir.alloca !fir.array // CHECK-NOT: fir.freemem diff --git 
a/libc/fuzzing/__support/freelist_heap_fuzz.cpp b/libc/fuzzing/__support/freelist_heap_fuzz.cpp
index 7b7985a83c3e6..0b400cb156491 100644
--- a/libc/fuzzing/__support/freelist_heap_fuzz.cpp
+++ b/libc/fuzzing/__support/freelist_heap_fuzz.cpp
@@ -24,7 +24,7 @@ asm(R"(
 _end:
 .fill 1024
 __llvm_libc_heap_limit:
-)";
+)");
 
 using LIBC_NAMESPACE::FreeListHeap;
 using LIBC_NAMESPACE::inline_memset;
diff --git a/libc/fuzzing/string/CMakeLists.txt b/libc/fuzzing/string/CMakeLists.txt
index efda80b59c951..0918e92552ea7 100644
--- a/libc/fuzzing/string/CMakeLists.txt
+++ b/libc/fuzzing/string/CMakeLists.txt
@@ -40,3 +40,11 @@ add_libc_fuzzer(
   DEPENDS
     libc.src.strings.bcmp
 )
+
+add_libc_fuzzer(
+  strlen_fuzz
+  SRCS
+    strlen_fuzz.cpp
+  DEPENDS
+    libc.src.string.strlen
+)
diff --git a/libc/fuzzing/string/strlen_fuzz.cpp b/libc/fuzzing/string/strlen_fuzz.cpp
new file mode 100644
index 0000000000000..dd72c19b7fdc7
--- /dev/null
+++ b/libc/fuzzing/string/strlen_fuzz.cpp
@@ -0,0 +1,32 @@
+//===-- strlen_fuzz.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Fuzzing test for llvm-libc strlen implementation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "src/string/strlen.h"
+#include <stdint.h>
+#include <string.h>
+
+// always null terminate the data
+extern "C" size_t LLVMFuzzerMutate(uint8_t *data, size_t size, size_t max_size);
+extern "C" size_t LLVMFuzzerCustomMutator(uint8_t *data, size_t size,
+                                          size_t max_size, unsigned int seed) {
+  size = LLVMFuzzerMutate(data, size, max_size);
+  data[size - 1] = '\0';
+  return size;
+}
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  size_t ref = ::strlen(reinterpret_cast<const char *>(data));
+  size_t impl = LIBC_NAMESPACE::strlen(reinterpret_cast<const char *>(data));
+  if (ref != impl)
+    __builtin_trap();
+  return 0;
+}
diff --git a/libc/src/__support/OSUtil/linux/fcntl.cpp b/libc/src/__support/OSUtil/linux/fcntl.cpp
index bb76eee90efd2..08db4859c6417 100644
--- a/libc/src/__support/OSUtil/linux/fcntl.cpp
+++ b/libc/src/__support/OSUtil/linux/fcntl.cpp
@@ -66,7 +66,7 @@ ErrorOr<int> fcntl(int fd, int cmd, void *arg) {
         LIBC_NAMESPACE::syscall_impl(FCNTL_SYSCALL_ID, fd, cmd, &flk64);
     // On failure, return
     if (ret < 0)
-      return Error(-1);
+      return Error(-ret);
     // Check for overflow, i.e. the offsets are not the same when cast
     // to off_t from off64_t.
     if (static_cast<off_t>(flk64.l_len) != flk64.l_len ||
diff --git a/libc/src/stdio/printf_core/vfprintf_internal.h b/libc/src/stdio/printf_core/vfprintf_internal.h
index 564441d3bf51a..c47a03d741f98 100644
--- a/libc/src/stdio/printf_core/vfprintf_internal.h
+++ b/libc/src/stdio/printf_core/vfprintf_internal.h
@@ -51,8 +51,11 @@ LIBC_INLINE void funlockfile(::FILE *f) { ::funlockfile(f); }
 LIBC_INLINE FileIOResult fwrite_unlocked(const void *ptr, size_t size,
                                          size_t nmemb, ::FILE *f) {
   // Need to use system errno in this case, as system write will set this errno
-  // which we need to propagate back into our code.
-  return {::fwrite_unlocked(ptr, size, nmemb, f), errno};
+  // which we need to propagate back into our code. fwrite only modifies errno
+  // if there was an error, and errno may have previously been nonzero. Only
+  // return errno if there was an error.
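+  // A sketch of the resulting contract (illustrative, not part of the patch):
+  // a full write of nmemb members yields {nmemb, 0} even when a stale errno
+  // was already nonzero on entry, while a short write yields the short count
+  // paired with the errno that fwrite_unlocked just set.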
+ size_t members_written = ::fwrite_unlocked(ptr, size, nmemb, f); + return {members_written, members_written == nmemb ? 0 : errno}; } #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE } // namespace internal diff --git a/libc/src/string/memory_utils/aarch64/inline_strlen.h b/libc/src/string/memory_utils/aarch64/inline_strlen.h index 87f5ccdd56e23..eafaca9776a42 100644 --- a/libc/src/string/memory_utils/aarch64/inline_strlen.h +++ b/libc/src/string/memory_utils/aarch64/inline_strlen.h @@ -8,14 +8,13 @@ #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H +#include "src/__support/macros/properties/cpu_features.h" + #if defined(__ARM_NEON) #include "src/__support/CPP/bit.h" // countr_zero - #include #include // size_t - namespace LIBC_NAMESPACE_DECL { - namespace neon { [[maybe_unused]] LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static size_t string_length(const char *src) { @@ -45,9 +44,63 @@ string_length(const char *src) { } } } // namespace neon +} // namespace LIBC_NAMESPACE_DECL +#endif // __ARM_NEON -namespace string_length_impl = neon; +#ifdef LIBC_TARGET_CPU_HAS_SVE +#include "src/__support/macros/optimization.h" +#include +namespace LIBC_NAMESPACE_DECL { +namespace sve { +[[maybe_unused]] LIBC_INLINE static size_t string_length(const char *src) { + const uint8_t *ptr = reinterpret_cast(src); + // Initialize the first-fault register to all true + svsetffr(); + const svbool_t all_true = svptrue_b8(); // all true predicate + svbool_t cmp_zero; + size_t len = 0; + for (;;) { + // Read a vector's worth of bytes, stopping on first fault. + svuint8_t data = svldff1_u8(all_true, &ptr[len]); + svbool_t fault_mask = svrdffr_z(all_true); + bool has_no_fault = svptest_last(all_true, fault_mask); + if (LIBC_LIKELY(has_no_fault)) { + // First fault did not fail: the whole vector is valid. + // Avoid depending on the contents of FFR beyond the branch. + len += svcntb(); // speculative increment + cmp_zero = svcmpeq_n_u8(all_true, data, 0); + bool has_no_zero = !svptest_any(all_true, cmp_zero); + if (LIBC_LIKELY(has_no_zero)) + continue; + len -= svcntb(); // undo speculative increment + break; + } else { + // First fault failed: only some of the vector is valid. + // Perform the comparison only on the valid bytes. + cmp_zero = svcmpeq_n_u8(fault_mask, data, 0); + bool has_zero = svptest_any(fault_mask, cmp_zero); + if (LIBC_LIKELY(has_zero)) + break; + svsetffr(); + len += svcntp_b8(all_true, fault_mask); + continue; + } + } + // Select the bytes before the first and count them. 
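+  // (Explanatory note: svbrkb_z produces a predicate covering only the lanes
+  // strictly before the first NUL match in cmp_zero, so the svcntp_b8 below
+  // counts the bytes of the final vector that precede the terminator.)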
+ svbool_t before_zero = svbrkb_z(all_true, cmp_zero); + len += svcntp_b8(all_true, before_zero); + return len; +} +} // namespace sve +} // namespace LIBC_NAMESPACE_DECL +#endif // LIBC_TARGET_CPU_HAS_SVE + +namespace LIBC_NAMESPACE_DECL { +#ifdef LIBC_TARGET_CPU_HAS_SVE +namespace string_length_impl = sve; +#elif defined(__ARM_NEON) +namespace string_length_impl = neon; +#endif } // namespace LIBC_NAMESPACE_DECL -#endif // __ARM_NEON #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H diff --git a/libc/test/src/fcntl/fcntl_test.cpp b/libc/test/src/fcntl/fcntl_test.cpp index 84feb34e537a0..d008aea54b425 100644 --- a/libc/test/src/fcntl/fcntl_test.cpp +++ b/libc/test/src/fcntl/fcntl_test.cpp @@ -94,68 +94,105 @@ TEST_F(LlvmLibcFcntlTest, FcntlSetFl) { ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); } -TEST_F(LlvmLibcFcntlTest, FcntlGetLkRead) { - using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkread.test"; - auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); - - struct flock flk, svflk; - int retVal; - int fd = - LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDONLY, S_IRWXU); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(fd, 0); - - flk.l_type = F_RDLCK; - flk.l_start = 0; - flk.l_whence = SEEK_SET; - flk.l_len = 50; - - // copy flk into svflk - svflk = flk; - - retVal = LIBC_NAMESPACE::fcntl(fd, F_GETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - ASSERT_NE((int)flk.l_type, F_WRLCK); // File should not be write locked. - - retVal = LIBC_NAMESPACE::fcntl(fd, F_SETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - - ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); -} - -TEST_F(LlvmLibcFcntlTest, FcntlGetLkWrite) { - using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkwrite.test"; - auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); - - struct flock flk, svflk; - int retVal; - int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(fd, 0); - - flk.l_type = F_WRLCK; - flk.l_start = 0; - flk.l_whence = SEEK_SET; - flk.l_len = 0; - - // copy flk into svflk - svflk = flk; - - retVal = LIBC_NAMESPACE::fcntl(fd, F_GETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - ASSERT_NE((int)flk.l_type, F_RDLCK); // File should not be read locked. - - retVal = LIBC_NAMESPACE::fcntl(fd, F_SETLK, &svflk); - ASSERT_ERRNO_SUCCESS(); - ASSERT_GT(retVal, -1); - - ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); -} +/* Tests that are common between OFD and traditional variants of fcntl locks. */ +template +class LibcFcntlCommonLockTests : public LlvmLibcFcntlTest { +public: + void GetLkRead() { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkread.test"; + const auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); + + struct flock flk = {}; + struct flock svflk = {}; + int retVal; + int fd = + LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDONLY, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(fd, 0); + + flk.l_type = F_RDLCK; + flk.l_start = 0; + flk.l_whence = SEEK_SET; + flk.l_len = 50; + + // copy flk into svflk + svflk = flk; + + retVal = LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + ASSERT_NE((int)svflk.l_type, F_WRLCK); // File should not be write locked. 
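+    // (Note: F_GETLK and F_OFD_GETLK write their result back into the flock;
+    // when no conflicting lock would block the request, l_type is replaced
+    // with F_UNLCK, so the assertion above only needs to rule out F_WRLCK.)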
+ + retVal = LIBC_NAMESPACE::fcntl(fd, SETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + } + + void GetLkWrite() { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + constexpr const char *TEST_FILE_NAME = "testdata/fcntl_getlkwrite.test"; + const auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); + + struct flock flk = {}; + struct flock svflk = {}; + int retVal; + int fd = + LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(fd, 0); + + flk.l_type = F_WRLCK; + flk.l_start = 0; + flk.l_whence = SEEK_SET; + flk.l_len = 0; + + // copy flk into svflk + svflk = flk; + + retVal = LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + ASSERT_NE((int)svflk.l_type, F_RDLCK); // File should not be read locked. + + retVal = LIBC_NAMESPACE::fcntl(fd, SETLK_CMD, &svflk); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(retVal, -1); + + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + } + + void UseAfterClose() { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + constexpr const char *TEST_FILE_NAME = + "testdata/fcntl_use_after_close.test"; + const auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); + int fd = + LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + + flock flk = {}; + flk.l_type = F_RDLCK; + flk.l_start = 0; + flk.l_whence = SEEK_SET; + flk.l_len = 50; + ASSERT_EQ(-1, LIBC_NAMESPACE::fcntl(fd, GETLK_CMD, &flk)); + ASSERT_ERRNO_EQ(EBADF); + } +}; + +#define COMMON_LOCK_TESTS(NAME, GETLK_CMD, SETLK_CMD) \ + using NAME = LibcFcntlCommonLockTests; \ + TEST_F(NAME, GetLkRead) { GetLkRead(); } \ + TEST_F(NAME, GetLkWrite) { GetLkWrite(); } \ + TEST_F(NAME, UseAfterClose) { UseAfterClose(); } \ + static_assert(true, "Require semicolon.") + +COMMON_LOCK_TESTS(LlvmLibcFcntlProcessAssociatedLockTest, F_GETLK, F_SETLK); +COMMON_LOCK_TESTS(LlvmLibcFcntlOpenFileDescriptionLockTest, F_OFD_GETLK, + F_OFD_SETLK); TEST_F(LlvmLibcFcntlTest, UseAfterClose) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; diff --git a/libc/test/src/string/strlen_test.cpp b/libc/test/src/string/strlen_test.cpp index 4eb9d47e9209d..784dd7b194b3f 100644 --- a/libc/test/src/string/strlen_test.cpp +++ b/libc/test/src/string/strlen_test.cpp @@ -22,3 +22,15 @@ TEST(LlvmLibcStrLenTest, AnyString) { size_t result = LIBC_NAMESPACE::strlen(any); ASSERT_EQ((size_t)12, result); } + +TEST(LlvmLibcStrLenTest, DataAfterNulString) { + constexpr char A[10] = {'a', 'b', 'c', 'd', 'e', 'f', 0, 'h', 'i', 'j'}; + size_t result = LIBC_NAMESPACE::strlen(A); + ASSERT_EQ((size_t)6, result); +} + +TEST(LlvmLibcStrLenTest, MultipleNulsInOneWord) { + constexpr char A[10] = {'a', 'b', 0, 'd', 'e', 'f', 0, 'h', 'i', 'j'}; + size_t result = LIBC_NAMESPACE::strlen(A); + ASSERT_EQ((size_t)2, result); +} diff --git a/libcxx/include/__memory/temp_value.h b/libcxx/include/__memory/temp_value.h index 4a133b3fbcf6c..5285bcab9a30d 100644 --- a/libcxx/include/__memory/temp_value.h +++ b/libcxx/include/__memory/temp_value.h @@ -12,7 +12,6 @@ #include <__config> #include <__memory/addressof.h> #include <__memory/allocator_traits.h> -#include <__type_traits/aligned_storage.h> #include <__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -26,7 +25,7 @@ struct __temp_value { typedef allocator_traits<_Alloc> _Traits; #ifdef _LIBCPP_CXX03_LANG - 
typename aligned_storage::type __v; + _ALIGNAS_TYPE(_Tp) char __v[sizeof(_Tp)]; #else union { _Tp __v; diff --git a/libcxx/include/any b/libcxx/include/any index 148fb16c802a5..b3e5b8748df4c 100644 --- a/libcxx/include/any +++ b/libcxx/include/any @@ -88,7 +88,6 @@ namespace std { # include <__new/allocate.h> # include <__type_traits/add_cv_quals.h> # include <__type_traits/add_pointer.h> -# include <__type_traits/aligned_storage.h> # include <__type_traits/conditional.h> # include <__type_traits/decay.h> # include <__type_traits/enable_if.h> @@ -147,14 +146,13 @@ template _LIBCPP_HIDE_FROM_ABI add_pointer_t<_ValueType> any_cast(any*) _NOEXCEPT; namespace __any_imp { -_LIBCPP_SUPPRESS_DEPRECATED_PUSH -using _Buffer _LIBCPP_NODEBUG = aligned_storage_t<3 * sizeof(void*), alignof(void*)>; -_LIBCPP_SUPPRESS_DEPRECATED_POP +inline constexpr size_t __small_buffer_size = 3 * sizeof(void*); +inline constexpr size_t __small_buffer_alignment = alignof(void*); template using _IsSmallObject _LIBCPP_NODEBUG = integral_constant::value >; enum class _Action { _Destroy, _Copy, _Move, _Get, _TypeInfo }; @@ -284,7 +282,7 @@ private: union _Storage { _LIBCPP_HIDE_FROM_ABI constexpr _Storage() : __ptr(nullptr) {} void* __ptr; - __any_imp::_Buffer __buf; + alignas(__any_imp::__small_buffer_alignment) char __buf[__any_imp::__small_buffer_size]; }; _LIBCPP_HIDE_FROM_ABI void* diff --git a/libcxx/include/future b/libcxx/include/future index 4b7c09841cbd3..0877d66602e6b 100644 --- a/libcxx/include/future +++ b/libcxx/include/future @@ -584,12 +584,9 @@ inline future_status __assoc_sub_state::wait_for(const chrono::duration<_Rep, _P template class _LIBCPP_HIDDEN __assoc_state : public __assoc_sub_state { typedef __assoc_sub_state base; - _LIBCPP_SUPPRESS_DEPRECATED_PUSH - typedef typename aligned_storage::type _Up; - _LIBCPP_SUPPRESS_DEPRECATED_POP protected: - _Up __value_; + _ALIGNAS_TYPE(_Rp) char __value_[sizeof(_Rp)]; _LIBCPP_HIDE_FROM_ABI_VIRTUAL void __on_zero_shared() _NOEXCEPT override; diff --git a/libcxx/src/exception.cpp b/libcxx/src/exception.cpp index ac6324cd9fe35..9932141006591 100644 --- a/libcxx/src/exception.cpp +++ b/libcxx/src/exception.cpp @@ -9,20 +9,12 @@ #define _LIBCPP_ENABLE_CXX20_REMOVED_UNCAUGHT_EXCEPTION #define _LIBCPP_DISABLE_DEPRECATION_WARNINGS -#include -#include -#include - -#if defined(LIBCXXRT) || defined(LIBCXX_BUILDING_LIBCXXABI) -# include -using namespace __cxxabiv1; -# define HAVE_DEPENDENT_EH_ABI 1 -#endif +#include <__config> #if defined(_LIBCPP_ABI_MICROSOFT) # include "support/runtime/exception_msvc.ipp" # include "support/runtime/exception_pointer_msvc.ipp" -#elif defined(_LIBCPPABI_VERSION) +#elif defined(LIBCXX_BUILDING_LIBCXXABI) # include "support/runtime/exception_libcxxabi.ipp" # include "support/runtime/exception_pointer_cxxabi.ipp" #elif defined(LIBCXXRT) diff --git a/libcxx/src/support/runtime/exception_fallback.ipp b/libcxx/src/support/runtime/exception_fallback.ipp index ba283aee22901..dca904e902da1 100644 --- a/libcxx/src/support/runtime/exception_fallback.ipp +++ b/libcxx/src/support/runtime/exception_fallback.ipp @@ -8,6 +8,8 @@ //===----------------------------------------------------------------------===// #include <__verbose_abort> +#include +#include "include/atomic_support.h" namespace std { diff --git a/libcxx/src/support/runtime/exception_glibcxx.ipp b/libcxx/src/support/runtime/exception_glibcxx.ipp index aa67cab6bc239..5eb8d87f6d4e1 100644 --- a/libcxx/src/support/runtime/exception_glibcxx.ipp +++ 
b/libcxx/src/support/runtime/exception_glibcxx.ipp @@ -11,6 +11,9 @@ # error header can only be used when targeting libstdc++ or libsupc++ #endif +#include +#include + namespace std { bad_alloc::bad_alloc() noexcept {} diff --git a/libcxx/src/support/runtime/exception_libcxxabi.ipp b/libcxx/src/support/runtime/exception_libcxxabi.ipp index df6bd6574bde2..c42bb237d9db8 100644 --- a/libcxx/src/support/runtime/exception_libcxxabi.ipp +++ b/libcxx/src/support/runtime/exception_libcxxabi.ipp @@ -7,6 +7,10 @@ // //===----------------------------------------------------------------------===// +#include + +#include + #ifndef _LIBCPPABI_VERSION # error this header can only be used with libc++abi #endif @@ -17,9 +21,9 @@ bool uncaught_exception() noexcept { return uncaught_exceptions() > 0; } int uncaught_exceptions() noexcept { #if _LIBCPPABI_VERSION > 1001 - return __cxa_uncaught_exceptions(); + return abi::__cxa_uncaught_exceptions(); #else - return __cxa_uncaught_exception() ? 1 : 0; + return abi::__cxa_uncaught_exception() ? 1 : 0; #endif } diff --git a/libcxx/src/support/runtime/exception_libcxxrt.ipp b/libcxx/src/support/runtime/exception_libcxxrt.ipp index f17fecc71e34b..6afdc006563c9 100644 --- a/libcxx/src/support/runtime/exception_libcxxrt.ipp +++ b/libcxx/src/support/runtime/exception_libcxxrt.ipp @@ -11,6 +11,8 @@ # error this header may only be used when targeting libcxxrt #endif +#include + namespace std { bad_exception::~bad_exception() noexcept {} diff --git a/libcxx/src/support/runtime/exception_msvc.ipp b/libcxx/src/support/runtime/exception_msvc.ipp index 2ae004bb02e5d..7114d90892cc1 100644 --- a/libcxx/src/support/runtime/exception_msvc.ipp +++ b/libcxx/src/support/runtime/exception_msvc.ipp @@ -12,6 +12,8 @@ #endif #include <__verbose_abort> +#include +#include extern "C" { typedef void(__cdecl* terminate_handler)(); diff --git a/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp b/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp index 8f5c2060bb06c..75cb7c9d82ccd 100644 --- a/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp +++ b/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp @@ -7,22 +7,21 @@ // //===----------------------------------------------------------------------===// -#ifndef HAVE_DEPENDENT_EH_ABI -# error this header may only be used with libc++abi or libcxxrt -#endif +#include +#include namespace std { -exception_ptr::~exception_ptr() noexcept { __cxa_decrement_exception_refcount(__ptr_); } +exception_ptr::~exception_ptr() noexcept { abi::__cxa_decrement_exception_refcount(__ptr_); } exception_ptr::exception_ptr(const exception_ptr& other) noexcept : __ptr_(other.__ptr_) { - __cxa_increment_exception_refcount(__ptr_); + abi::__cxa_increment_exception_refcount(__ptr_); } exception_ptr& exception_ptr::operator=(const exception_ptr& other) noexcept { if (__ptr_ != other.__ptr_) { - __cxa_increment_exception_refcount(other.__ptr_); - __cxa_decrement_exception_refcount(__ptr_); + abi::__cxa_increment_exception_refcount(other.__ptr_); + abi::__cxa_decrement_exception_refcount(__ptr_); __ptr_ = other.__ptr_; } return *this; @@ -31,7 +30,7 @@ exception_ptr& exception_ptr::operator=(const exception_ptr& other) noexcept { exception_ptr exception_ptr::__from_native_exception_pointer(void* __e) noexcept { exception_ptr ptr; ptr.__ptr_ = __e; - __cxa_increment_exception_refcount(ptr.__ptr_); + abi::__cxa_increment_exception_refcount(ptr.__ptr_); return ptr; } @@ -51,12 +50,12 @@ exception_ptr current_exception() noexcept { // this whole function would 
be just: // return exception_ptr(__cxa_current_primary_exception()); exception_ptr ptr; - ptr.__ptr_ = __cxa_current_primary_exception(); + ptr.__ptr_ = abi::__cxa_current_primary_exception(); return ptr; } void rethrow_exception(exception_ptr p) { - __cxa_rethrow_primary_exception(p.__ptr_); + abi::__cxa_rethrow_primary_exception(p.__ptr_); // if p.__ptr_ is NULL, above returns so we terminate terminate(); } diff --git a/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp b/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp index 174b44ce0e6f7..4b08db6f1ae6f 100644 --- a/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp +++ b/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp @@ -16,6 +16,8 @@ // stable ABI), and its rethrow_exception(std::__exception_ptr::exception_ptr) // function. +#include + namespace std { namespace __exception_ptr { diff --git a/libcxx/src/support/runtime/exception_pointer_msvc.ipp b/libcxx/src/support/runtime/exception_pointer_msvc.ipp index 2be5136176e32..4141e0312349b 100644 --- a/libcxx/src/support/runtime/exception_pointer_msvc.ipp +++ b/libcxx/src/support/runtime/exception_pointer_msvc.ipp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include #include #include diff --git a/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp b/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp index 05a71ce34e5ac..5e55f0f6dede3 100644 --- a/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp +++ b/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include <__verbose_abort> +#include namespace std { diff --git a/libunwind/src/Registers.hpp b/libunwind/src/Registers.hpp index 5a5b57835379a..9d4c8344150f6 100644 --- a/libunwind/src/Registers.hpp +++ b/libunwind/src/Registers.hpp @@ -20,6 +20,11 @@ #include "libunwind_ext.h" #include "shadow_stack_unwind.h" +#if __has_include() +#include +#define HAVE_SYS_AUXV_H +#endif + namespace libunwind { // For emulating 128-bit registers @@ -1828,6 +1833,7 @@ inline const char *Registers_ppc64::getRegisterName(int regNum) { /// process. 
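A standalone sketch of the SME probe the Registers_arm64 changes below rely on, assuming a Linux aarch64 target where <sys/auxv.h> is available; the HWCAP2_SME bit (bit 23 of AT_HWCAP2) matches the constant the patch hardcodes:

#include <sys/auxv.h>

static bool has_sme() {
#if defined(__aarch64__)
  // HWCAP2_SME is bit 23 of AT_HWCAP2 on aarch64 Linux.
  return (getauxval(AT_HWCAP2) & (1UL << 23)) != 0;
#else
  return false;
#endif
}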
class _LIBUNWIND_HIDDEN Registers_arm64; extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *); +extern "C" int64_t __libunwind_Registers_arm64_za_disable(); #if defined(_LIBUNWIND_USE_GCS) extern "C" void *__libunwind_shstk_get_jump_target() { @@ -1837,7 +1843,7 @@ extern "C" void *__libunwind_shstk_get_jump_target() { class _LIBUNWIND_HIDDEN Registers_arm64 { public: - Registers_arm64(); + Registers_arm64() = default; Registers_arm64(const void *registers); Registers_arm64(const Registers_arm64 &); Registers_arm64 &operator=(const Registers_arm64 &); @@ -1855,7 +1861,10 @@ class _LIBUNWIND_HIDDEN Registers_arm64 { v128 getVectorRegister(int num) const; void setVectorRegister(int num, v128 value); static const char *getRegisterName(int num); - void jumpto() { __libunwind_Registers_arm64_jumpto(this); } + void jumpto() { + zaDisable(); + __libunwind_Registers_arm64_jumpto(this); + } static constexpr int lastDwarfRegNum() { return _LIBUNWIND_HIGHEST_DWARF_REGISTER_ARM64; } @@ -1908,25 +1917,43 @@ class _LIBUNWIND_HIDDEN Registers_arm64 { private: uint64_t lazyGetVG() const; + void zaDisable() const { + if (!_misc_registers.__has_sme) + return; + if (__libunwind_Registers_arm64_za_disable() != 0) + _LIBUNWIND_ABORT("SME ZA disable failed"); + } + + static bool checkHasSME() { +#if defined(HAVE_SYS_AUXV_H) + constexpr int hwcap2_sme = (1 << 23); + unsigned long hwcap2 = getauxval(AT_HWCAP2); + return (hwcap2 & hwcap2_sme) != 0; +#endif + // TODO: Support other platforms. + return false; + } + struct GPRs { - uint64_t __x[29]; // x0-x28 - uint64_t __fp; // Frame pointer x29 - uint64_t __lr; // Link register x30 - uint64_t __sp; // Stack pointer x31 - uint64_t __pc; // Program counter - uint64_t __ra_sign_state; // RA sign state register + uint64_t __x[29] = {}; // x0-x28 + uint64_t __fp = 0; // Frame pointer x29 + uint64_t __lr = 0; // Link register x30 + uint64_t __sp = 0; // Stack pointer x31 + uint64_t __pc = 0; // Program counter + uint64_t __ra_sign_state = 0; // RA sign state register }; struct Misc { - mutable uint64_t __vg = 0; // Vector Granule + mutable uint32_t __vg = 0; // Vector Granule + bool __has_sme = checkHasSME(); }; - GPRs _registers; + GPRs _registers = {}; // Currently only the lower double in 128-bit vectore registers // is perserved during unwinding. We could define new register // numbers (> 96) which mean whole vector registers, then this // struct would need to change to contain whole vector registers. - double _vectorHalfRegisters[32]; + double _vectorHalfRegisters[32] = {}; // Miscellaneous/virtual registers. These are stored below the GPRs and FPRs // as they do not correspond to physical registers, so do not need to be @@ -1971,10 +1998,6 @@ Registers_arm64::operator=(const Registers_arm64 &other) { return *this; } -inline Registers_arm64::Registers_arm64() { - memset(static_cast(this), 0, sizeof(*this)); -} - inline bool Registers_arm64::validRegister(int regNum) const { if (regNum == UNW_REG_IP) return true; diff --git a/libunwind/src/UnwindRegistersSave.S b/libunwind/src/UnwindRegistersSave.S index b7ddd0a621d18..f988fd461def1 100644 --- a/libunwind/src/UnwindRegistersSave.S +++ b/libunwind/src/UnwindRegistersSave.S @@ -829,6 +829,68 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext) ret #endif +// +// extern "C" int64_t __libunwind_Registers_arm64_za_disable() +// +// This function implements the requirements of the __arm_za_disable ABI +// routine, except that it will not abort; it will return a non-zero value +// to signify the routine failed. 
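+//
+// (Background, assuming the AArch64 SME ABI's lazy-save scheme: TPIDR2_EL0 is
+// either null or points at a TPIDR2 block laid out as
+//   { void *za_save_buffer; uint16_t num_za_save_slices; uint8_t reserved[6]; }
+// which is why the code below checks the halfword at offset #8 and the
+// reserved bytes at offsets #10 and #12.)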
+// +// Note: This function uses SME instructions. It must only be called if SME +// has been confirmed to be available. +// +// On return: +// +// A status is placed in x0. A zero value indicates success; any non-zero +// value indicates failure. +// + .p2align 2 +DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_za_disable) + .variant_pcs __libunwind_Registers_arm64_za_disable +#if __has_feature(ptrauth_calls) + pacibsp +#endif + // If TPIDR2_EL0 is null, the subroutine just disables ZA. + .inst 0xd53bd0b0 // mrs x16, TPIDR2_EL0 + cbz x16, 1f + + // If any of the reserved bytes in the first 16 bytes of the TPIDR2 block are + // nonzero, return a non-zero value (libunwind will then abort). + ldrh w0, [x16, #10] + cbnz w0, 2f + ldr w0, [x16, #12] + cbnz w0, 2f + + // If num_za_save_slices is zero, the subroutine just disables ZA. + ldrh w0, [x16, #8] + cbz x0, 1f + + // If za_save_buffer is NULL, the subroutine just disables ZA. + ldr x16, [x16] + cbz x16, 1f + + // Store ZA to za_save_buffer. + mov x15, xzr +0: + .inst 0xe1206200 // str za[w15,0], [x16] + .inst 0x04305830 // addsvl x16, x16, #1 + add x15, x15, #1 + cmp x0, x15 + b.ne 0b +1: + // * Set TPIDR2_EL0 to null. + .inst 0xd51bd0bf // msr TPIDR2_EL0, xzr + // * Set PSTATE.ZA to 0. + .inst 0xd503447f // smstop za + // * Return zero (success) + mov x0, xzr +2: +#if __has_feature(ptrauth_calls) + retab +#else + ret +#endif + #elif defined(__arm__) && !defined(__APPLE__) #if !defined(__ARM_ARCH_ISA_ARM) diff --git a/libunwind/test/aarch64_za_unwind.pass.cpp b/libunwind/test/aarch64_za_unwind.pass.cpp new file mode 100644 index 0000000000000..2985bb8d298de --- /dev/null +++ b/libunwind/test/aarch64_za_unwind.pass.cpp @@ -0,0 +1,117 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: linux && target={{aarch64-.+}} + +#include +#include +#include +#include +#include +#include + +// Basic test of unwinding with SME lazy saves. This tests libunwind disables ZA +// (and commits a lazy save of ZA) before resuming from unwinding. + +// Note: This test requires SME (and is setup to pass on targets without SME). + +static bool checkHasSME() { + constexpr int hwcap2_sme = (1 << 23); + unsigned long hwcap2 = getauxval(AT_HWCAP2); + return (hwcap2 & hwcap2_sme) != 0; +} + +struct TPIDR2Block { + void *za_save_buffer; + uint64_t num_save_slices; +}; + +__attribute__((noinline)) void private_za() { + // Note: Lazy save active on entry to function. + unw_context_t context; + unw_cursor_t cursor; + + unw_getcontext(&context); + unw_init_local(&cursor, &context); + unw_step(&cursor); + unw_resume(&cursor); +} + +bool isZAOn() { + register uint64_t svcr asm("x20"); + asm(".inst 0xd53b4254" : "=r"(svcr)); + return (svcr & 0b10) != 0; +} + +__attribute__((noinline)) void za_function_with_lazy_save() { + register uint64_t tmp asm("x8"); + + // SMSTART ZA (should zero ZA) + asm(".inst 0xd503457f"); + + // RDSVL x8, #1 (read streaming vector length) + asm(".inst 0x04bf5828" : "=r"(tmp)); + + // Allocate and fill ZA save buffer with 0xAA. 
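+  // (Note: rdsvl x8, #1 returns the streaming vector length SVL in bytes, and
+  // ZA is SVL x SVL bytes, hence the tmp * tmp buffer below. The 0xAA fill
+  // lets the test verify that the unwinder's lazy-save commit overwrote the
+  // buffer with ZA's contents, which SMSTART ZA zeroed.)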
+ size_t buffer_size = tmp * tmp; + uint8_t *za_save_buffer = (uint8_t *)alloca(buffer_size); + memset(za_save_buffer, 0xAA, buffer_size); + + TPIDR2Block block = {za_save_buffer, tmp}; + tmp = reinterpret_cast(&block); + + // MRS TPIDR2_EL0, x8 (setup lazy save of ZA) + asm(".inst 0xd51bd0a8" ::"r"(tmp)); + + // ZA should be on before unwinding. + if (!isZAOn()) { + fprintf(stderr, __FILE__ ": fail (ZA not on before call)\n"); + abort(); + } else { + fprintf(stderr, __FILE__ ": pass (ZA on before call)\n"); + } + + private_za(); + + // ZA should be off after unwinding. + if (isZAOn()) { + fprintf(stderr, __FILE__ ": fail (ZA on after unwinding)\n"); + abort(); + } else { + fprintf(stderr, __FILE__ ": pass (ZA off after unwinding)\n"); + } + + // MRS x8, TPIDR2_EL0 (read TPIDR2_EL0) + asm(".inst 0xd53bd0a8" : "=r"(tmp)); + // ZA should have been saved (TPIDR2_EL0 zero). + if (tmp != 0) { + fprintf(stderr, __FILE__ ": fail (TPIDR2_EL0 non-null after unwinding)\n"); + abort(); + } else { + fprintf(stderr, __FILE__ ": pass (TPIDR2_EL0 null after unwinding)\n"); + } + + // ZA (all zero) should have been saved to the buffer. + for (unsigned i = 0; i < buffer_size; ++i) { + if (za_save_buffer[i] != 0) { + fprintf(stderr, + __FILE__ ": fail (za_save_buffer non-zero after unwinding)\n"); + abort(); + } + } + fprintf(stderr, __FILE__ ": pass (za_save_buffer zero'd after unwinding)\n"); +} + +int main(int, char **) { + if (!checkHasSME()) { + fprintf(stderr, __FILE__ ": pass (no SME support)\n"); + return 0; // Pass (SME is required for this test to run). + } + za_function_with_lazy_save(); + return 0; +} diff --git a/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h b/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h index 124cb55eaf723..57acb82dd96e9 100644 --- a/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h +++ b/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h @@ -32,7 +32,8 @@ class BreakpointLocationCollection { ~BreakpointLocationCollection(); - BreakpointLocationCollection &operator=(const BreakpointLocationCollection &rhs); + BreakpointLocationCollection & + operator=(const BreakpointLocationCollection &rhs); /// Add the breakpoint \a bp_loc_sp to the list. 
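An aside on the mutex change in BreakpointLocationCollection below: moving to std::recursive_mutex lets a thread that already holds the lock (for instance while iterating through the LockingAdaptedIterable) re-enter other locking members without deadlocking. A minimal sketch of the hazard, with hypothetical names:

#include <mutex>

class Collection {
  mutable std::recursive_mutex m;

public:
  bool IsInternal() const {
    std::lock_guard<std::recursive_mutex> g(m);
    return true; // body elided
  }
  void GetDescription() const {
    std::lock_guard<std::recursive_mutex> g(m);
    // Re-entry: IsInternal() locks m again on the same thread. With a plain
    // std::mutex this deadlocks (formally undefined behavior); a
    // recursive_mutex permits it.
    (void)IsInternal();
  }
};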
/// @@ -172,17 +173,18 @@ class BreakpointLocationCollection { lldb::break_id_t break_loc_id) const; collection m_break_loc_collection; - mutable std::mutex m_collection_mutex; + mutable std::recursive_mutex m_collection_mutex; /// These are used if we're preserving breakpoints in this list: const bool m_preserving_bkpts = false; std::map, lldb::BreakpointSP> m_preserved_bps; public: - typedef llvm::iterator_range + typedef LockingAdaptedIterable BreakpointLocationCollectionIterable; BreakpointLocationCollectionIterable BreakpointLocations() { - return BreakpointLocationCollectionIterable(m_break_loc_collection); + return BreakpointLocationCollectionIterable(m_break_loc_collection, + m_collection_mutex); } }; } // namespace lldb_private diff --git a/lldb/source/Breakpoint/BreakpointLocationCollection.cpp b/lldb/source/Breakpoint/BreakpointLocationCollection.cpp index 97715836ec104..adff4299a5289 100644 --- a/lldb/source/Breakpoint/BreakpointLocationCollection.cpp +++ b/lldb/source/Breakpoint/BreakpointLocationCollection.cpp @@ -24,7 +24,7 @@ BreakpointLocationCollection::BreakpointLocationCollection(bool preserving) BreakpointLocationCollection::~BreakpointLocationCollection() = default; void BreakpointLocationCollection::Add(const BreakpointLocationSP &bp_loc) { - std::lock_guard guard(m_collection_mutex); + std::lock_guard guard(m_collection_mutex); BreakpointLocationSP old_bp_loc = FindByIDPair(bp_loc->GetBreakpoint().GetID(), bp_loc->GetID()); if (!old_bp_loc.get()) { @@ -44,7 +44,7 @@ void BreakpointLocationCollection::Add(const BreakpointLocationSP &bp_loc) { bool BreakpointLocationCollection::Remove(lldb::break_id_t bp_id, lldb::break_id_t bp_loc_id) { - std::lock_guard guard(m_collection_mutex); + std::lock_guard guard(m_collection_mutex); collection::iterator pos = GetIDPairIterator(bp_id, bp_loc_id); // Predicate if (pos != m_break_loc_collection.end()) { if (m_preserving_bkpts) { @@ -117,7 +117,7 @@ const BreakpointLocationSP BreakpointLocationCollection::FindByIDPair( } BreakpointLocationSP BreakpointLocationCollection::GetByIndex(size_t i) { - std::lock_guard guard(m_collection_mutex); + std::lock_guard guard(m_collection_mutex); BreakpointLocationSP stop_sp; if (i < m_break_loc_collection.size()) stop_sp = m_break_loc_collection[i]; @@ -127,7 +127,7 @@ BreakpointLocationSP BreakpointLocationCollection::GetByIndex(size_t i) { const BreakpointLocationSP BreakpointLocationCollection::GetByIndex(size_t i) const { - std::lock_guard guard(m_collection_mutex); + std::lock_guard guard(m_collection_mutex); BreakpointLocationSP stop_sp; if (i < m_break_loc_collection.size()) stop_sp = m_break_loc_collection[i]; @@ -168,7 +168,7 @@ bool BreakpointLocationCollection::ShouldStop( } bool BreakpointLocationCollection::ValidForThisThread(Thread &thread) { - std::lock_guard guard(m_collection_mutex); + std::lock_guard guard(m_collection_mutex); collection::iterator pos, begin = m_break_loc_collection.begin(), end = m_break_loc_collection.end(); @@ -180,7 +180,7 @@ bool BreakpointLocationCollection::ValidForThisThread(Thread &thread) { } bool BreakpointLocationCollection::IsInternal() const { - std::lock_guard guard(m_collection_mutex); + std::lock_guard guard(m_collection_mutex); collection::const_iterator pos, begin = m_break_loc_collection.begin(), end = m_break_loc_collection.end(); @@ -197,7 +197,7 @@ bool BreakpointLocationCollection::IsInternal() const { void BreakpointLocationCollection::GetDescription( Stream *s, lldb::DescriptionLevel level) { - std::lock_guard 
guard(m_collection_mutex); + std::lock_guard guard(m_collection_mutex); collection::iterator pos, begin = m_break_loc_collection.begin(), end = m_break_loc_collection.end(); @@ -212,8 +212,10 @@ BreakpointLocationCollection &BreakpointLocationCollection::operator=( const BreakpointLocationCollection &rhs) { if (this != &rhs) { std::lock(m_collection_mutex, rhs.m_collection_mutex); - std::lock_guard lhs_guard(m_collection_mutex, std::adopt_lock); - std::lock_guard rhs_guard(rhs.m_collection_mutex, std::adopt_lock); + std::lock_guard lhs_guard(m_collection_mutex, + std::adopt_lock); + std::lock_guard rhs_guard(rhs.m_collection_mutex, + std::adopt_lock); m_break_loc_collection = rhs.m_break_loc_collection; } return *this; diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_locations/after_rebuild/TestLocationsAfterRebuild.py b/lldb/test/API/functionalities/breakpoint/breakpoint_locations/after_rebuild/TestLocationsAfterRebuild.py index e772965f5bb85..bc53feaafa635 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_locations/after_rebuild/TestLocationsAfterRebuild.py +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_locations/after_rebuild/TestLocationsAfterRebuild.py @@ -7,7 +7,7 @@ import lldb import lldbsuite.test.lldbutil as lldbutil from lldbsuite.test.lldbtest import * -from lldbsuite.test.decorators import skipIfWindows, skipIfRemote +from lldbsuite.test.decorators import skipIfWindows import os @@ -19,7 +19,6 @@ class TestLocationsAfterRebuild(TestBase): # On Windows we cannot remove a file that lldb is debugging. @skipIfWindows - @skipIfRemote def test_remaining_location_spec(self): """If we rebuild a couple of times some of the old locations get removed. Make sure the command-line breakpoint id @@ -55,6 +54,24 @@ def test_remaining_location_spec(self): self, target, bkpt ) + # After enabling locate_module callback for main executables, + # the number of locations may vary depending on the platform. + num_locs = bkpt.GetNumLocations() bkpt_id = bkpt.GetID() - loc_string = f"{bkpt_id}.3" - self.runCmd(f"break disable {loc_string}") + + self.assertGreater( + num_locs, + 0, + f"Expected at least one breakpoint location, but found {num_locs}", + ) + + # Iterate through all valid locations and verify we can disable each one. + # This tests that breakpoint location IDs remain valid after rebuilds. + for loc_idx in range(num_locs): + loc = bkpt.GetLocationAtIndex(loc_idx) + self.assertTrue(loc.IsValid(), f"Location at index {loc_idx} is not valid") + + # Get the actual location ID from the location object + loc_id = loc.GetID() + loc_string = f"{bkpt_id}.{loc_id}" + self.runCmd(f"break disable {loc_string}") diff --git a/lldb/test/API/functionalities/thread/step_until/function.list b/lldb/test/API/functionalities/thread/step_until/function.list index 5900fe8c35069..d8caa20ad3550 100644 --- a/lldb/test/API/functionalities/thread/step_until/function.list +++ b/lldb/test/API/functionalities/thread/step_until/function.list @@ -1 +1,4 @@ -!call_me +v1 +f call_me +c 0 +c 1 diff --git a/lldb/test/Shell/helper/build.py b/lldb/test/Shell/helper/build.py index a5a7e997be044..1fa8aab92c128 100755 --- a/lldb/test/Shell/helper/build.py +++ b/lldb/test/Shell/helper/build.py @@ -804,7 +804,19 @@ def _get_link_command(self): args.extend(self._obj_file_names()) if sys.platform == "darwin": + # By default, macOS doesn't allow injecting the ASAN + # runtime into system processes. 
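+            # (Assumed rationale: pointing ld64 at the system libLTO.dylib that
+            # sits next to xcrun's clang keeps the link step from loading a
+            # just-built, possibly sanitized libLTO into the system linker,
+            # which the note above says macOS would refuse.)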
+ system_clang = ( + subprocess.check_output(["xcrun", "-find", "clang"]) + .strip() + .decode("utf-8") + ) + system_liblto = os.path.join( + os.path.dirname(os.path.dirname(system_clang)), "lib", "libLTO.dylib" + ) args.extend(["-isysroot", self.apple_sdk]) + args.extend(["-Wl,-lto_library", "-Wl," + system_liblto]) + elif self.objc_gnustep_lib: args.extend(["-L", self.objc_gnustep_lib, "-lobjc"]) if sys.platform == "linux": diff --git a/lldb/test/Shell/helper/toolchain.py b/lldb/test/Shell/helper/toolchain.py index 728f6347242f1..faa29d23387cc 100644 --- a/lldb/test/Shell/helper/toolchain.py +++ b/lldb/test/Shell/helper/toolchain.py @@ -250,6 +250,15 @@ def use_support_substitutions(config): "-L{}".format(config.libcxx_libs_dir), "-lc++", ] + # By default, macOS doesn't allow injecting the ASAN runtime into system processes. + if platform.system() in ["Darwin"] and config.llvm_use_sanitizer: + system_clang = ( + subprocess.check_output(["xcrun", "-find", "clang"]).strip().decode("utf-8") + ) + system_liblto = os.path.join( + os.path.dirname(os.path.dirname(system_clang)), "lib", "libLTO.dylib" + ) + host_flags += ["-Wl,-lto_library", "-Wl," + system_liblto] host_flags = " ".join(host_flags) config.substitutions.append(("%clang_host", "%clang " + host_flags)) diff --git a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h index 7b1a5f5019589..ee1f28377f7e4 100644 --- a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h +++ b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h @@ -68,17 +68,13 @@ class BasicBlockSectionsProfileReader { BasicBlockSectionsProfileReader() = default; - // Returns true if basic block sections profile exist for function \p - // FuncName. + // Returns true if function \p FuncName is hot based on the basic block + // section profile. bool isFunctionHot(StringRef FuncName) const; - // Returns a pair with first element representing whether basic block sections - // profile exist for the function \p FuncName, and the second element - // representing the basic block sections profile (cluster info) for this - // function. If the first element is true and the second element is empty, it - // means unique basic block sections are desired for all basic blocks of the - // function. - std::pair> + // Returns the cluster info for the function \p FuncName. Returns an empty + // vector if function has no cluster info. + SmallVector getClusterInfoForFunction(StringRef FuncName) const; // Returns the path clonings for the given function. @@ -190,7 +186,7 @@ class BasicBlockSectionsProfileReaderWrapperPass : public ImmutablePass { bool isFunctionHot(StringRef FuncName) const; - std::pair> + SmallVector getClusterInfoForFunction(StringRef FuncName) const; SmallVector> diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 7010cffe23a11..43f28ed79f9dd 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -113,15 +113,18 @@ struct ExtAddrMode { /// class LLVM_ABI TargetInstrInfo : public MCInstrInfo { protected: + const TargetRegisterInfo &TRI; + /// Subtarget specific sub-array of MCInstrInfo's RegClassByHwModeTables /// (i.e. the table for the active HwMode). This should be indexed by /// MCOperandInfo's RegClass field for LookupRegClassByHwMode operands. 
const int16_t *const RegClassByHwMode; - TargetInstrInfo(unsigned CFSetupOpcode = ~0u, unsigned CFDestroyOpcode = ~0u, - unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u, + TargetInstrInfo(const TargetRegisterInfo &TRI, unsigned CFSetupOpcode = ~0u, + unsigned CFDestroyOpcode = ~0u, unsigned CatchRetOpcode = ~0u, + unsigned ReturnOpcode = ~0u, const int16_t *const RegClassByHwModeTable = nullptr) - : RegClassByHwMode(RegClassByHwModeTable), + : TRI(TRI), RegClassByHwMode(RegClassByHwModeTable), CallFrameSetupOpcode(CFSetupOpcode), CallFrameDestroyOpcode(CFDestroyOpcode), CatchRetOpcode(CatchRetOpcode), ReturnOpcode(ReturnOpcode) {} @@ -131,6 +134,8 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { TargetInstrInfo &operator=(const TargetInstrInfo &) = delete; virtual ~TargetInstrInfo(); + const TargetRegisterInfo &getRegisterInfo() const { return TRI; } + static bool isGenericOpcode(unsigned Opc) { return Opc <= TargetOpcode::GENERIC_OP_END; } @@ -154,9 +159,8 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { /// Given a machine instruction descriptor, returns the register /// class constraint for OpNum, or NULL. - virtual const TargetRegisterClass * - getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI) const; + virtual const TargetRegisterClass *getRegClass(const MCInstrDesc &MCID, + unsigned OpNum) const; /// Returns true if MI is an instruction we are unable to reason about /// (like a call or something with unmodeled side effects). @@ -459,8 +463,7 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { /// SubIdx. virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - unsigned SubIdx, const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const; + unsigned SubIdx, const MachineInstr &Orig) const; /// Clones instruction or the whole instruction bundle \p Orig and /// insert into \p MBB before \p InsertBefore. The target may update operands @@ -1193,8 +1196,7 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { /// register spill instruction, part of prologue, during the frame lowering. virtual void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const { llvm_unreachable("Target didn't implement " "TargetInstrInfo::storeRegToStackSlot!"); @@ -1212,8 +1214,7 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { /// register reload instruction, part of epilogue, during the frame lowering. virtual void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const { llvm_unreachable("Target didn't implement " "TargetInstrInfo::loadRegFromStackSlot!"); @@ -1764,17 +1765,6 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { return true; } - /// Return true if it's safe to move a machine instruction. - /// This allows the backend to prevent certain special instruction - /// sequences from being broken by instruction motion in optimization - /// passes. - /// By default, this returns true for every instruction. 
- virtual bool isSafeToMove(const MachineInstr &MI, - const MachineBasicBlock *MBB, - const MachineFunction &MF) const { - return true; - } - /// Test if the given instruction should be considered a scheduling boundary. /// This primarily includes labels and terminators. virtual bool isSchedulingBoundary(const MachineInstr &MI, diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h index c1500e00bddd0..7c97afd5a7f5a 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h @@ -190,6 +190,16 @@ enum class OMPScheduleType { LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierMask) }; +/// The fallback types for the dyn_groupprivate clause. +enum class OMPDynGroupprivateFallbackType : uint64_t { + /// Abort the execution. + Abort = 0, + /// Return a null pointer. + Null = 1, + /// Allocate from an implementation-defined memory space. + DefaultMem = 2 +}; + // Default OpenMP mapper name suffix. inline constexpr const char *OmpDefaultMapperName = ".omp.default.mapper"; diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 857c5da91a9f5..9c37775af52f8 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -2458,19 +2458,24 @@ class OpenMPIRBuilder { /// The number of threads. ArrayRef NumThreads; /// The size of the dynamic shared memory. - Value *DynCGGroupMem = nullptr; + Value *DynCGroupMem = nullptr; /// True if the kernel has 'no wait' clause. bool HasNoWait = false; + /// The fallback mechanism for the shared memory. + omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback = + omp::OMPDynGroupprivateFallbackType::Abort; // Constructors for TargetKernelArgs. TargetKernelArgs() = default; TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs, - Value *TripCount, ArrayRef NumTeams, - ArrayRef NumThreads, Value *DynCGGroupMem, - bool HasNoWait) - : NumTargetItems(NumTargetItems), RTArgs(RTArgs), TripCount(TripCount), - NumTeams(NumTeams), NumThreads(NumThreads), - DynCGGroupMem(DynCGGroupMem), HasNoWait(HasNoWait) {} + Value *TripCount, ArrayRef NumTeams, + ArrayRef NumThreads, Value *DynCGroupMem, + bool HasNoWait, + omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback) + : NumTargetItems(NumTargetItems), RTArgs(RTArgs), + TripCount(TripCount), NumTeams(NumTeams), + NumThreads(NumThreads), DynCGroupMem(DynCGroupMem), + HasNoWait(HasNoWait), DynCGroupMemFallback(DynCGroupMemFallback) {} }; /// Create the kernel args vector used by emitTargetKernel. This function @@ -3255,6 +3260,10 @@ class OpenMPIRBuilder { /// dependency information as passed in the depend clause /// \param HasNowait Whether the target construct has a `nowait` clause or /// not. + /// \param DynCGroupMem The size of the dynamic groupprivate memory for each + /// cgroup. + /// \param DynCGroupMemFallback The fallback mechanism to execute if the + /// requested cgroup memory cannot be provided.
LLVM_ABI InsertPointOrErrorTy createTarget( const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, @@ -3266,7 +3275,10 @@ class OpenMPIRBuilder { TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, - const SmallVector &Dependencies, bool HasNowait = false); + const SmallVector &Dependencies, bool HasNowait = false, + Value *DynCGroupMem = nullptr, + omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback = + omp::OMPDynGroupprivateFallbackType::Abort); /// Returns __kmpc_for_static_init_* runtime function for the specified /// size \a IVSize and sign \a IVSigned. Will create a distribute call diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h index cb8e568de02e0..5a012cf0c0264 100644 --- a/llvm/include/llvm/Support/SpecialCaseList.h +++ b/llvm/include/llvm/Support/SpecialCaseList.h @@ -12,19 +12,11 @@ #ifndef LLVM_SUPPORT_SPECIALCASELIST_H #define LLVM_SUPPORT_SPECIALCASELIST_H -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/RadixTree.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringMap.h" -#include "llvm/ADT/iterator_range.h" #include "llvm/Support/Allocator.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Support/GlobPattern.h" -#include "llvm/Support/Regex.h" +#include "llvm/Support/Error.h" #include #include #include -#include #include namespace llvm { @@ -125,93 +117,20 @@ class SpecialCaseList { SpecialCaseList(SpecialCaseList const &) = delete; SpecialCaseList &operator=(SpecialCaseList const &) = delete; -private: - // Lagacy v1 matcher. - class RegexMatcher { + class Section { public: - LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber); - LLVM_ABI void preprocess(bool BySize); - - LLVM_ABI void - match(StringRef Query, - llvm::function_ref Cb) const; - - struct Reg { - Reg(StringRef Name, unsigned LineNo, Regex &&Rg) - : Name(Name), LineNo(LineNo), Rg(std::move(Rg)) {} - StringRef Name; - unsigned LineNo; - Regex Rg; - }; - - std::vector RegExes; - }; - - class GlobMatcher { - public: - LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber); - LLVM_ABI void preprocess(bool BySize); - - LLVM_ABI void - match(StringRef Query, - llvm::function_ref Cb) const; - - struct Glob { - Glob(StringRef Name, unsigned LineNo, GlobPattern &&Pattern) - : Name(Name), LineNo(LineNo), Pattern(std::move(Pattern)) {} - StringRef Name; - unsigned LineNo; - GlobPattern Pattern; - }; - - std::vector Globs; - - RadixTree, - RadixTree, - SmallVector>> - PrefixSuffixToGlob; - - RadixTree, - SmallVector> - SubstrToGlob; - }; - - /// Represents a set of patterns and their line numbers - class Matcher { - public: - LLVM_ABI Matcher(bool UseGlobs, bool RemoveDotSlash); - - LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber); - LLVM_ABI void preprocess(bool BySize); - - LLVM_ABI void - match(StringRef Query, - llvm::function_ref Cb) const; + LLVM_ABI Section(StringRef Name, unsigned FileIdx, bool UseGlobs); + LLVM_ABI Section(Section &&); + LLVM_ABI ~Section(); - LLVM_ABI bool matchAny(StringRef Query) const { - bool R = false; - match(Query, [&](StringRef, unsigned) { R = true; }); - return R; - } + // Returns name of the section, its entire string in []. + StringRef name() const { return Name; } - std::variant M; - bool RemoveDotSlash; - }; - - using SectionEntries = StringMap>; + // Returns true if string 'Name' matches section name interpreted as a glob. 
+ LLVM_ABI bool matchName(StringRef Name) const; -protected: - struct Section { - Section(StringRef Str, unsigned FileIdx, bool UseGlobs) - : SectionMatcher(UseGlobs, /*RemoveDotSlash=*/false), SectionStr(Str), - FileIdx(FileIdx) {} - - Section(Section &&) = default; - - Matcher SectionMatcher; - SectionEntries Entries; - std::string SectionStr; - unsigned FileIdx; + // Returns sequence number of the file where this section is defined. + unsigned fileIndex() const { return FileIdx; } // Helper method to search by Prefix, Query, and Category. Returns // 1-based line number on which rule is defined, or 0 if there is no match. @@ -223,11 +142,16 @@ class SpecialCaseList { LLVM_ABI StringRef getLongestMatch(StringRef Prefix, StringRef Query, StringRef Category) const; + /// Returns true if the section has any entries for the given prefix. + LLVM_ABI bool hasPrefix(StringRef Prefix) const; + private: friend class SpecialCaseList; - LLVM_ABI void preprocess(bool OrderBySize); - LLVM_ABI const SpecialCaseList::Matcher * - findMatcher(StringRef Prefix, StringRef Category) const; + class SectionImpl; + + StringRef Name; + unsigned FileIdx; + std::unique_ptr Impl; }; ArrayRef sections() const { return Sections; } diff --git a/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp b/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp index f31d625eca14c..9d53c37461ba8 100644 --- a/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp +++ b/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp @@ -43,13 +43,19 @@ static void printModuleDebugInfo(raw_ostream &O, const Module *M, // filenames), so just print a few useful things. for (DICompileUnit *CU : Finder.compile_units()) { O << "Compile unit: "; - auto Lang = - dwarf::LanguageString(CU->getSourceLanguage().getUnversionedName()); - if (!Lang.empty()) - O << Lang; + + DISourceLanguageName Lang = CU->getSourceLanguage(); + auto LangStr = + Lang.hasVersionedName() + ? dwarf::SourceLanguageNameString( + static_cast(Lang.getName())) + : dwarf::LanguageString(Lang.getName()); + + if (!LangStr.empty()) + O << LangStr; else - O << "unknown-language(" << CU->getSourceLanguage().getUnversionedName() - << ")"; + O << "unknown-language(" << CU->getSourceLanguage().getName() << ")"; + printFile(O, CU->getFilename(), CU->getDirectory()); O << '\n'; } diff --git a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 6567bd403c857..46b5bb7908227 100644 --- a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -395,7 +395,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction( // Note register reference... const TargetRegisterClass *RC = nullptr; if (i < MI.getDesc().getNumOperands()) - RC = TII->getRegClass(MI.getDesc(), i, TRI); + RC = TII->getRegClass(MI.getDesc(), i); AggressiveAntiDepState::RegisterReference RR = { &MO, RC }; RegRefs.emplace(Reg.asMCReg(), RR); } @@ -479,7 +479,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI, // Note register reference... 
const TargetRegisterClass *RC = nullptr; if (i < MI.getDesc().getNumOperands()) - RC = TII->getRegClass(MI.getDesc(), i, TRI); + RC = TII->getRegClass(MI.getDesc(), i); AggressiveAntiDepState::RegisterReference RR = { &MO, RC }; RegRefs.emplace(Reg.asMCReg(), RR); } diff --git a/llvm/lib/CodeGen/BasicBlockSections.cpp b/llvm/lib/CodeGen/BasicBlockSections.cpp index e317e1c06741f..52e2909bec072 100644 --- a/llvm/lib/CodeGen/BasicBlockSections.cpp +++ b/llvm/lib/CodeGen/BasicBlockSections.cpp @@ -183,8 +183,7 @@ updateBranches(MachineFunction &MF, // clusters are ordered in increasing order of their IDs, with the "Exception" // and "Cold" succeeding all other clusters. // FuncClusterInfo represents the cluster information for basic blocks. It -// maps from BBID of basic blocks to their cluster information. If this is -// empty, it means unique sections for all basic blocks in the function. +// maps from BBID of basic blocks to their cluster information. static void assignSections(MachineFunction &MF, const DenseMap &FuncClusterInfo) { @@ -197,10 +196,8 @@ assignSections(MachineFunction &MF, for (auto &MBB : MF) { // With the 'all' option, every basic block is placed in a unique section. // With the 'list' option, every basic block is placed in a section - // associated with its cluster, unless we want individual unique sections - // for every basic block in this function (if FuncClusterInfo is empty). - if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All || - FuncClusterInfo.empty()) { + // associated with its cluster. + if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All) { // If unique sections are desired for all basic blocks of the function, we // set every basic block's section ID equal to its original position in // the layout (which is equal to its number). This ensures that basic @@ -308,22 +305,22 @@ bool BasicBlockSections::handleBBSections(MachineFunction &MF) { if (BBSectionsType == BasicBlockSection::List && hasInstrProfHashMismatch(MF)) return false; - // Renumber blocks before sorting them. This is useful for accessing the - // original layout positions and finding the original fallthroughs. - MF.RenumberBlocks(); DenseMap FuncClusterInfo; if (BBSectionsType == BasicBlockSection::List) { - auto [HasProfile, ClusterInfo] = - getAnalysis() - .getClusterInfoForFunction(MF.getName()); - if (!HasProfile) + auto ClusterInfo = getAnalysis() + .getClusterInfoForFunction(MF.getName()); + if (ClusterInfo.empty()) return false; for (auto &BBClusterInfo : ClusterInfo) { FuncClusterInfo.try_emplace(BBClusterInfo.BBID, BBClusterInfo); } } + // Renumber blocks before sorting them. This is useful for accessing the + // original layout positions and finding the original fallthroughs. 
+ MF.RenumberBlocks(); + MF.setBBSectionsType(BBSectionsType); assignSections(MF, FuncClusterInfo); diff --git a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp index 485b44ae4c4aa..c234c0f1b0b34 100644 --- a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp +++ b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp @@ -58,22 +58,24 @@ BasicBlockSectionsProfileReader::parseUniqueBBID(StringRef S) const { } bool BasicBlockSectionsProfileReader::isFunctionHot(StringRef FuncName) const { - return getClusterInfoForFunction(FuncName).first; + return !getClusterInfoForFunction(FuncName).empty(); } -std::pair> +SmallVector BasicBlockSectionsProfileReader::getClusterInfoForFunction( StringRef FuncName) const { auto R = ProgramPathAndClusterInfo.find(getAliasName(FuncName)); - return R != ProgramPathAndClusterInfo.end() - ? std::pair(true, R->second.ClusterInfo) - : std::pair(false, SmallVector()); + return R != ProgramPathAndClusterInfo.end() ? R->second.ClusterInfo + : SmallVector(); } SmallVector> BasicBlockSectionsProfileReader::getClonePathsForFunction( StringRef FuncName) const { - return ProgramPathAndClusterInfo.lookup(getAliasName(FuncName)).ClonePaths; + auto R = ProgramPathAndClusterInfo.find(getAliasName(FuncName)); + return R != ProgramPathAndClusterInfo.end() + ? R->second.ClonePaths + : SmallVector>(); } uint64_t BasicBlockSectionsProfileReader::getEdgeCount( @@ -494,7 +496,7 @@ bool BasicBlockSectionsProfileReaderWrapperPass::isFunctionHot( return BBSPR.isFunctionHot(FuncName); } -std::pair> +SmallVector BasicBlockSectionsProfileReaderWrapperPass::getClusterInfoForFunction( StringRef FuncName) const { return BBSPR.getClusterInfoForFunction(FuncName); diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index af1625a209569..7292bc2be0df2 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -1979,7 +1979,6 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { MachineBasicBlock::iterator FIB = FBB->begin(); MachineBasicBlock::iterator TIE = TBB->end(); MachineBasicBlock::iterator FIE = FBB->end(); - MachineFunction &MF = *MBB->getParent(); while (TIB != TIE && FIB != FIE) { // Skip dbg_value instructions. These do not count. TIB = skipDebugInstructionsForward(TIB, TIE, false); @@ -1994,10 +1993,6 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { // Hard to reason about register liveness with predicated instruction. break; - if (!TII->isSafeToMove(*TIB, MBB, MF)) - // Don't hoist the instruction if it isn't safe to move. - break; - bool IsSafe = true; for (MachineOperand &MO : TIB->operands()) { // Don't attempt to hoist instructions with register masks. 
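The BasicBlockSectionsProfileReader change above collapses the old (HasProfile, ClusterInfo) pair into a single vector, so an empty result now means "no profile", and the former "profile present but empty, so give every block its own section" special case is dropped (see the assignSections comment update). A minimal sketch of the new caller pattern follows; the template arguments (SmallVector<BBClusterInfo>, DenseMap<UniqueBBID, BBClusterInfo>) are reconstructed assumptions since the patch text elides them, and Reader stands in for the wrapper-pass result.

// Sketch only: consuming the single-vector API, as BasicBlockSections.cpp
// now does. An empty vector doubles as "no basic block sections profile".
SmallVector<BBClusterInfo> ClusterInfo =
    Reader.getClusterInfoForFunction(MF.getName());
if (ClusterInfo.empty())
  return false; // Previously: if (!HasProfile) return false;
DenseMap<UniqueBBID, BBClusterInfo> FuncClusterInfo;
for (const BBClusterInfo &CI : ClusterInfo)
  FuncClusterInfo.try_emplace(CI.BBID, CI);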
diff --git a/llvm/lib/CodeGen/BreakFalseDeps.cpp b/llvm/lib/CodeGen/BreakFalseDeps.cpp index 1846880b0c181..fead3ee250841 100644 --- a/llvm/lib/CodeGen/BreakFalseDeps.cpp +++ b/llvm/lib/CodeGen/BreakFalseDeps.cpp @@ -133,7 +133,7 @@ bool BreakFalseDeps::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, } // Get the undef operand's register class - const TargetRegisterClass *OpRC = TII->getRegClass(MI->getDesc(), OpIdx, TRI); + const TargetRegisterClass *OpRC = TII->getRegClass(MI->getDesc(), OpIdx); assert(OpRC && "Not a valid register class"); // If the instruction has a true dependency, we can hide the false depdency diff --git a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp index 86377cff2d29d..3259a3e83c541 100644 --- a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -187,7 +187,7 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr &MI) { const TargetRegisterClass *NewRC = nullptr; if (i < MI.getDesc().getNumOperands()) - NewRC = TII->getRegClass(MI.getDesc(), i, TRI); + NewRC = TII->getRegClass(MI.getDesc(), i); // For now, only allow the register to be changed if its register // class is consistent across all uses. @@ -316,7 +316,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr &MI, unsigned Count) { const TargetRegisterClass *NewRC = nullptr; if (i < MI.getDesc().getNumOperands()) - NewRC = TII->getRegClass(MI.getDesc(), i, TRI); + NewRC = TII->getRegClass(MI.getDesc(), i); // For now, only allow the register to be changed if its register // class is consistent across all uses. diff --git a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp index 8b74dcebd00ac..c23cac7974d51 100644 --- a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp +++ b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp @@ -420,7 +420,7 @@ class StatepointState { LLVM_DEBUG(dbgs() << "Insert spill before " << *InsertBefore); TII.storeRegToStackSlot(*MI.getParent(), InsertBefore, Reg, IsKill, FI, - RC, &TRI, Register()); + RC, Register()); } } @@ -429,7 +429,7 @@ class StatepointState { const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg); int FI = RegToSlotIdx[Reg]; if (It != MBB->end()) { - TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, &TRI, Register()); + TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, Register()); return; } @@ -437,7 +437,7 @@ class StatepointState { // and then swap them. assert(!MBB->empty() && "Empty block"); --It; - TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, &TRI, Register()); + TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, Register()); MachineInstr *Reload = It->getPrevNode(); int Dummy = 0; (void)Dummy; diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 5fab6ec506e94..e8954a3d9899b 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -114,7 +114,7 @@ Register llvm::constrainOperandRegClass( // Assume physical registers are properly constrained. 
assert(Reg.isVirtual() && "PhysReg not implemented"); - const TargetRegisterClass *OpRC = TII.getRegClass(II, OpIdx, &TRI); + const TargetRegisterClass *OpRC = TII.getRegClass(II, OpIdx); // Some of the target independent instructions, like COPY, may not impose any // register class constraints on some of their operands: If it's a use, we can // skip constraining as the instruction defining the register would constrain diff --git a/llvm/lib/CodeGen/InitUndef.cpp b/llvm/lib/CodeGen/InitUndef.cpp index e07e598019709..12b36f56d4d9a 100644 --- a/llvm/lib/CodeGen/InitUndef.cpp +++ b/llvm/lib/CodeGen/InitUndef.cpp @@ -232,7 +232,7 @@ bool InitUndef::processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB, MachineOperand &UseMO = MI.getOperand(UseOpIdx); if (UseMO.getReg() == MCRegister::NoRegister) { const TargetRegisterClass *RC = - TII->getRegClass(MI.getDesc(), UseOpIdx, TRI); + TII->getRegClass(MI.getDesc(), UseOpIdx); Register NewDest = MRI->createVirtualRegister(RC); // We don't have a way to update dead lanes, so keep track of the // new register so that we avoid querying it later. diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index c3e0964594bd5..68370303a3aef 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -473,7 +473,7 @@ bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI, MachineInstrSpan MIS(MII, MBB); // Insert spill without kill flag immediately after def. TII.storeRegToStackSlot(*MBB, MII, SrcReg, false, StackSlot, - MRI.getRegClass(SrcReg), &TRI, Register()); + MRI.getRegClass(SrcReg), Register()); LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MII); for (const MachineInstr &MI : make_range(MIS.begin(), MII)) getVDefInterval(MI, LIS); @@ -1119,7 +1119,7 @@ void InlineSpiller::insertReload(Register NewVReg, MachineInstrSpan MIS(MI, &MBB); TII.loadRegFromStackSlot(MBB, MI, NewVReg, StackSlot, - MRI.getRegClass(NewVReg), &TRI, Register()); + MRI.getRegClass(NewVReg), Register()); LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MI); @@ -1155,7 +1155,7 @@ void InlineSpiller::insertSpill(Register NewVReg, bool isKill, if (IsRealSpill) TII.storeRegToStackSlot(MBB, SpillBefore, NewVReg, isKill, StackSlot, - MRI.getRegClass(NewVReg), &TRI, Register()); + MRI.getRegClass(NewVReg), Register()); else // Don't spill undef value. // Anything works for undef, in particular keeping the memory @@ -1729,7 +1729,7 @@ void HoistSpillHelper::hoistAllSpills() { MachineBasicBlock::iterator MII = IPA.getLastInsertPointIter(OrigLI, *BB); MachineInstrSpan MIS(MII, BB); TII.storeRegToStackSlot(*BB, MII, LiveReg, false, Slot, - MRI.getRegClass(LiveReg), &TRI, Register()); + MRI.getRegClass(LiveReg), Register()); LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MII); for (const MachineInstr &MI : make_range(MIS.begin(), MII)) getVDefInterval(MI, LIS); diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp index 5b0365da4e8c6..6fe11704a9137 100644 --- a/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -88,7 +88,7 @@ SlotIndex LiveRangeEdit::rematerializeAt(MachineBasicBlock &MBB, bool Late, unsigned SubIdx, MachineInstr *ReplaceIndexMI) { assert(RM.OrigMI && "Invalid remat"); - TII.reMaterialize(MBB, MI, DestReg, SubIdx, *RM.OrigMI, tri); + TII.reMaterialize(MBB, MI, DestReg, SubIdx, *RM.OrigMI); // DestReg of the cloned instruction cannot be Dead. Set isDead of DestReg // to false anyway in case the isDead flag of RM.OrigMI's dest register // is true. 
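The TargetInstrInfo refactor running through the CodeGen changes above stores a TargetRegisterInfo reference in the base class, so getRegClass, reMaterialize, storeRegToStackSlot, and loadRegFromStackSlot all lose their TRI parameters. A sketch of what this implies for a target; "Foo" and its opcodes are hypothetical, and the subtarget accessor returning a pointer mirrors the common in-tree pattern rather than anything shown in this patch.

// Hypothetical target: TRI is supplied once at construction instead of
// being threaded through every query.
FooInstrInfo::FooInstrInfo(const FooSubtarget &STI)
    : TargetInstrInfo(*STI.getRegisterInfo(), FOO::ADJCALLSTACKDOWN,
                      FOO::ADJCALLSTACKUP) {}

// Call sites shrink accordingly, matching the mechanical updates above:
const TargetRegisterClass *RC = TII->getRegClass(MI.getDesc(), OpIdx);
TII->storeRegToStackSlot(MBB, InsertPt, Reg, /*isKill=*/true, FrameIdx, RC,
                         Register());
TII->loadRegFromStackSlot(MBB, InsertPt, Reg, FrameIdx, RC, Register());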
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index c3b537605932b..5014ac7cdd58f 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -982,7 +982,7 @@ MachineInstr::getRegClassConstraint(unsigned OpIdx, assert(getMF() && "Can't have an MF reference here!"); // Most opcodes have fixed constraints in their MCInstrDesc. if (!isInlineAsm()) - return TII->getRegClass(getDesc(), OpIdx, TRI); + return TII->getRegClass(getDesc(), OpIdx); if (!getOperand(OpIdx).isReg()) return nullptr; diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 729e73c8c312c..c169467384f8b 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -1399,7 +1399,7 @@ MachineInstr *MachineLICMImpl::ExtractHoistableLoad(MachineInstr *MI, if (NewOpc == 0) return nullptr; const MCInstrDesc &MID = TII->get(NewOpc); MachineFunction &MF = *MI->getMF(); - const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI); + const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex); // Ok, we're unfolding. Create a temporary register and do the unfold. Register Reg = MRI->createVirtualRegister(RC); diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index 94ed82eee9b8f..0ceeda4eb16d2 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -569,7 +569,7 @@ bool MachineSinking::PerformSinkAndFold(MachineInstr &MI, // Sink a copy of the instruction, replacing a COPY instruction. MachineBasicBlock::iterator InsertPt = SinkDst->getIterator(); Register DstReg = SinkDst->getOperand(0).getReg(); - TII->reMaterialize(*SinkDst->getParent(), InsertPt, DstReg, 0, MI, *TRI); + TII->reMaterialize(*SinkDst->getParent(), InsertPt, DstReg, 0, MI); New = &*std::prev(InsertPt); if (!New->getDebugLoc()) New->setDebugLoc(SinkDst->getDebugLoc()); diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index fdf10480b6e05..013f52938b65c 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -2657,8 +2657,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { return; } if (MONum < MCID.getNumOperands()) { - if (const TargetRegisterClass *DRC = - TII->getRegClass(MCID, MONum, TRI)) { + if (const TargetRegisterClass *DRC = TII->getRegClass(MCID, MONum)) { if (!DRC->contains(Reg)) { report("Illegal physical register for instruction", MO, MONum); OS << printReg(Reg, TRI) << " is not a " @@ -2742,12 +2741,11 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { // has register class constraint, the virtual register must // comply to it. 
if (!isPreISelGenericOpcode(MCID.getOpcode()) && - MONum < MCID.getNumOperands() && - TII->getRegClass(MCID, MONum, TRI)) { + MONum < MCID.getNumOperands() && TII->getRegClass(MCID, MONum)) { report("Virtual register does not match instruction constraint", MO, MONum); OS << "Expect register class " - << TRI->getRegClassName(TII->getRegClass(MCID, MONum, TRI)) + << TRI->getRegClassName(TII->getRegClass(MCID, MONum)) << " but got nothing\n"; return; } @@ -2773,8 +2771,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { } } if (MONum < MCID.getNumOperands()) { - if (const TargetRegisterClass *DRC = - TII->getRegClass(MCID, MONum, TRI)) { + if (const TargetRegisterClass *DRC = TII->getRegClass(MCID, MONum)) { if (SubIdx) { const TargetRegisterClass *SuperRC = TRI->getLargestLegalSuperClass(RC, *MF); diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 7c7e881fcf48f..3b72bd2f0cd91 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -594,8 +594,7 @@ void RegAllocFastImpl::spill(MachineBasicBlock::iterator Before, LLVM_DEBUG(dbgs() << " to stack slot #" << FI << '\n'); const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); - TII->storeRegToStackSlot(*MBB, Before, AssignedReg, Kill, FI, &RC, TRI, - VirtReg); + TII->storeRegToStackSlot(*MBB, Before, AssignedReg, Kill, FI, &RC, VirtReg); ++NumStores; MachineBasicBlock::iterator FirstTerm = MBB->getFirstTerminator(); @@ -652,7 +651,7 @@ void RegAllocFastImpl::reload(MachineBasicBlock::iterator Before, << printReg(PhysReg, TRI) << '\n'); int FI = getStackSpaceFor(VirtReg); const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); - TII->loadRegFromStackSlot(*MBB, Before, PhysReg, FI, &RC, TRI, VirtReg); + TII->loadRegFromStackSlot(*MBB, Before, PhysReg, FI, &RC, VirtReg); ++NumLoads; } @@ -1124,7 +1123,7 @@ bool RegAllocFastImpl::defineVirtReg(MachineInstr &MI, unsigned OpNum, if (MO.isMBB()) { MachineBasicBlock *Succ = MO.getMBB(); TII->storeRegToStackSlot(*Succ, Succ->begin(), PhysReg, Kill, FI, - &RC, TRI, VirtReg); + &RC, VirtReg); ++NumStores; Succ->addLiveIn(PhysReg); } diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index f93a7f22c3961..005e44fc7080b 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -1374,7 +1374,7 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, } const unsigned DefSubIdx = DefMI->getOperand(0).getSubReg(); - const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI); + const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0); if (!DefMI->isImplicitDef()) { if (DstReg.isPhysical()) { Register NewDstReg = DstReg; diff --git a/llvm/lib/CodeGen/RegisterScavenging.cpp b/llvm/lib/CodeGen/RegisterScavenging.cpp index 7e26c2ed59949..d8861672a348f 100644 --- a/llvm/lib/CodeGen/RegisterScavenging.cpp +++ b/llvm/lib/CodeGen/RegisterScavenging.cpp @@ -276,14 +276,14 @@ RegScavenger::spill(Register Reg, const TargetRegisterClass &RC, int SPAdj, ": Cannot scavenge register without an emergency " "spill slot!"); } - TII->storeRegToStackSlot(*MBB, Before, Reg, true, FI, &RC, TRI, Register()); + TII->storeRegToStackSlot(*MBB, Before, Reg, true, FI, &RC, Register()); MachineBasicBlock::iterator II = std::prev(Before); unsigned FIOperandNum = getFrameIndexOperandNum(*II); TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this); // Restore the scavenged register before its use (or first terminator). 
- TII->loadRegFromStackSlot(*MBB, UseMI, Reg, FI, &RC, TRI, Register()); + TII->loadRegFromStackSlot(*MBB, UseMI, Reg, FI, &RC, Register()); II = std::prev(UseMI); FIOperandNum = getFrameIndexOperandNum(*II); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 768d03522d7c8..21f973d5b3a6b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10988,6 +10988,22 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { } } + // fold (sra (xor (sra x, c1), -1), c2) -> (xor (sra x, c3), -1) + // This allows merging two arithmetic shifts even when there's a NOT in + // between. + SDValue X; + APInt C1; + if (N1C && sd_match(N0, m_OneUse(m_Not( + m_OneUse(m_Sra(m_Value(X), m_ConstInt(C1))))))) { + APInt C2 = N1C->getAPIntValue(); + zeroExtendToMatch(C1, C2, 1 /* Overflow Bit */); + APInt Sum = C1 + C2; + unsigned ShiftSum = Sum.getLimitedValue(OpSizeInBits - 1); + SDValue NewShift = DAG.getNode( + ISD::SRA, DL, VT, X, DAG.getShiftAmountConstant(ShiftSum, VT, DL)); + return DAG.getNOT(DL, NewShift, VT); + } + // fold (sra (shl X, m), (sub result_size, n)) // -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for // result_size - n != m. diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 507b2d61a534c..5c84059da273b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1965,7 +1965,7 @@ Register FastISel::createResultReg(const TargetRegisterClass *RC) { Register FastISel::constrainOperandRegClass(const MCInstrDesc &II, Register Op, unsigned OpNum) { if (Op.isVirtual()) { - const TargetRegisterClass *RegClass = TII.getRegClass(II, OpNum, &TRI); + const TargetRegisterClass *RegClass = TII.getRegClass(II, OpNum); if (!MRI.constrainRegClass(Op, RegClass)) { // If it's not legal to COPY between the register classes, something // has gone very wrong before we got here. diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 1ae6f3503fd00..195d3b6625a82 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -125,7 +125,7 @@ void InstrEmitter::EmitCopyFromReg(SDValue Op, bool IsClone, Register SrcReg, const TargetRegisterClass *RC = nullptr; if (i + II.getNumDefs() < II.getNumOperands()) { RC = TRI->getAllocatableClass( - TII->getRegClass(II, i + II.getNumDefs(), TRI)); + TII->getRegClass(II, i + II.getNumDefs())); } if (!UseRC) UseRC = RC; @@ -197,7 +197,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, // register instead of creating a new vreg. Register VRBase; const TargetRegisterClass *RC = - TRI->getAllocatableClass(TII->getRegClass(II, i, TRI)); + TRI->getAllocatableClass(TII->getRegClass(II, i)); // Always let the value type influence the used register class. The // constraints on the instruction may be too lax to represent the value // type correctly. 
For example, a 64-bit float (X86::FR64) can't live in @@ -330,7 +330,7 @@ InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB, if (II) { const TargetRegisterClass *OpRC = nullptr; if (IIOpNum < II->getNumOperands()) - OpRC = TII->getRegClass(*II, IIOpNum, TRI); + OpRC = TII->getRegClass(*II, IIOpNum); if (OpRC) { unsigned MinNumRegs = MinRCSize; @@ -409,8 +409,7 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB, SDValue Op, Register VReg = R->getReg(); MVT OpVT = Op.getSimpleValueType(); const TargetRegisterClass *IIRC = - II ? TRI->getAllocatableClass(TII->getRegClass(*II, IIOpNum, TRI)) - : nullptr; + II ? TRI->getAllocatableClass(TII->getRegClass(*II, IIOpNum)) : nullptr; const TargetRegisterClass *OpRC = TLI->isTypeLegal(OpVT) ? TLI->getRegClassFor(OpVT, diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index f70b6cddcc099..12fc26d949581 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -340,7 +340,7 @@ static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos, unsigned Idx = RegDefPos.GetIdx(); const MCInstrDesc &Desc = TII->get(Opcode); - const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx, TRI); + const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx); assert(RC && "Not a valid register class"); RegClass = RC->getID(); // FIXME: Cost arbitrarily set to 1 because there doesn't seem to be a diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp index f9ecb2c97b2e0..8ec4bfbb5a330 100644 --- a/llvm/lib/CodeGen/SplitKit.cpp +++ b/llvm/lib/CodeGen/SplitKit.cpp @@ -1509,10 +1509,9 @@ void SplitEditor::forceRecomputeVNI(const VNInfo &ParentVNI) { } // Trace value through phis. - SmallPtrSet Visited; ///< whether VNI was/is in worklist. - SmallVector WorkList; - Visited.insert(&ParentVNI); - WorkList.push_back(&ParentVNI); + ///< whether VNI was/is in worklist. 
+ SmallPtrSet Visited = {&ParentVNI}; + SmallVector WorkList = {&ParentVNI}; const LiveInterval &ParentLI = Edit->getParent(); const SlotIndexes &Indexes = *LIS.getSlotIndexes(); diff --git a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp index b925ecb4437bc..a3033dfa48f4d 100644 --- a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -227,7 +227,7 @@ void TargetFrameLowering::spillCalleeSavedRegister( } else { const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII->storeRegToStackSlot(SaveBlock, MI, Reg, true, CS.getFrameIdx(), RC, - TRI, Register()); + Register()); } } @@ -241,8 +241,7 @@ void TargetFrameLowering::restoreCalleeSavedRegister( .addReg(CS.getDstReg(), getKillRegState(true)); } else { const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII->loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI, - Register()); + TII->loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, Register()); assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); } } diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 3c41bbeb4b327..d503d7a2345fd 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -58,9 +58,8 @@ static cl::opt MaxAccumulatorWidth( TargetInstrInfo::~TargetInstrInfo() = default; -const TargetRegisterClass * -TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI) const { +const TargetRegisterClass *TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, + unsigned OpNum) const { if (OpNum >= MCID.getNumOperands()) return nullptr; @@ -69,14 +68,14 @@ TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, // TODO: Remove isLookupPtrRegClass in favor of isLookupRegClassByHwMode if (OpInfo.isLookupPtrRegClass()) - return TRI->getPointerRegClass(RegClass); + return TRI.getPointerRegClass(RegClass); // Instructions like INSERT_SUBREG do not have fixed register classes. if (RegClass < 0) return nullptr; // Otherwise just look it up normally. - return TRI->getRegClass(RegClass); + return TRI.getRegClass(RegClass); } /// insertNoop - Insert a noop into the instruction stream at the specified @@ -223,13 +222,11 @@ MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI, // %1.sub = INST %1.sub(tied), %0.sub, implicit-def %1 SmallVector UpdateImplicitDefIdx; if (HasDef && MI.hasImplicitDef()) { - const TargetRegisterInfo *TRI = - MI.getMF()->getSubtarget().getRegisterInfo(); for (auto [OpNo, MO] : llvm::enumerate(MI.implicit_operands())) { Register ImplReg = MO.getReg(); if ((ImplReg.isVirtual() && ImplReg == Reg0) || (ImplReg.isPhysical() && Reg0.isPhysical() && - TRI->isSubRegisterEq(ImplReg, Reg0))) + TRI.isSubRegisterEq(ImplReg, Reg0))) UpdateImplicitDefIdx.push_back(OpNo + MI.getNumExplicitOperands()); } } @@ -425,28 +422,27 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, unsigned SubIdx, unsigned &Size, unsigned &Offset, const MachineFunction &MF) const { - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (!SubIdx) { - Size = TRI->getSpillSize(*RC); + Size = TRI.getSpillSize(*RC); Offset = 0; return true; } - unsigned BitSize = TRI->getSubRegIdxSize(SubIdx); + unsigned BitSize = TRI.getSubRegIdxSize(SubIdx); // Convert bit size to byte size. 
if (BitSize % 8) return false; - int BitOffset = TRI->getSubRegIdxOffset(SubIdx); + int BitOffset = TRI.getSubRegIdxOffset(SubIdx); if (BitOffset < 0 || BitOffset % 8) return false; Size = BitSize / 8; Offset = (unsigned)BitOffset / 8; - assert(TRI->getSpillSize(*RC) >= (Offset + Size) && "bad subregister range"); + assert(TRI.getSpillSize(*RC) >= (Offset + Size) && "bad subregister range"); if (!MF.getDataLayout().isLittleEndian()) { - Offset = TRI->getSpillSize(*RC) - (Offset + Size); + Offset = TRI.getSpillSize(*RC) - (Offset + Size); } return true; } @@ -454,8 +450,7 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, void TargetInstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const { + const MachineInstr &Orig) const { MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig); MI->substituteRegister(MI->getOperand(0).getReg(), DestReg, SubIdx, TRI); MBB.insert(I, MI); @@ -726,7 +721,6 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, // actual load size is. int64_t MemSize = 0; const MachineFrameInfo &MFI = MF.getFrameInfo(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); if (Flags & MachineMemOperand::MOStore) { MemSize = MFI.getObjectSize(FI); @@ -735,7 +729,7 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, int64_t OpSize = MFI.getObjectSize(FI); if (auto SubReg = MI.getOperand(OpIdx).getSubReg()) { - unsigned SubRegSize = TRI->getSubRegIdxSize(SubReg); + unsigned SubRegSize = TRI.getSubRegIdxSize(SubReg); if (SubRegSize > 0 && !(SubRegSize % 8)) OpSize = SubRegSize / 8; } @@ -800,11 +794,11 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, // code. BuildMI(*MBB, Pos, MI.getDebugLoc(), get(TargetOpcode::KILL)).add(MO); } else { - storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI, + storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, Register()); } } else - loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, TRI, Register()); + loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, Register()); return &*--Pos; } @@ -880,8 +874,8 @@ static void transferImplicitOperands(MachineInstr *MI, } } -void TargetInstrInfo::lowerCopy(MachineInstr *MI, - const TargetRegisterInfo *TRI) const { +void TargetInstrInfo::lowerCopy( + MachineInstr *MI, const TargetRegisterInfo * /*Remove me*/) const { if (MI->allDefsAreDead()) { MI->setDesc(get(TargetOpcode::KILL)); return; @@ -911,7 +905,7 @@ void TargetInstrInfo::lowerCopy(MachineInstr *MI, SrcMO.getReg().isPhysical() ? 
SrcMO.isRenamable() : false); if (MI->getNumOperands() > 2) - transferImplicitOperands(MI, TRI); + transferImplicitOperands(MI, &TRI); MI->eraseFromParent(); } @@ -1327,8 +1321,7 @@ void TargetInstrInfo::reassociateOps( MachineFunction *MF = Root.getMF(); MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI); + const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, &TRI); MachineOperand &OpA = Prev.getOperand(OperandIndices[1]); MachineOperand &OpB = Root.getOperand(OperandIndices[2]); @@ -1337,9 +1330,12 @@ void TargetInstrInfo::reassociateOps( MachineOperand &OpC = Root.getOperand(0); Register RegA = OpA.getReg(); + unsigned SubRegA = OpA.getSubReg(); Register RegB = OpB.getReg(); Register RegX = OpX.getReg(); + unsigned SubRegX = OpX.getSubReg(); Register RegY = OpY.getReg(); + unsigned SubRegY = OpY.getSubReg(); Register RegC = OpC.getReg(); if (RegA.isVirtual()) @@ -1357,6 +1353,7 @@ void TargetInstrInfo::reassociateOps( // recycling RegB because the MachineCombiner's computation of the critical // path requires a new register definition rather than an existing one. Register NewVR = MRI.createVirtualRegister(RC); + unsigned SubRegNewVR = 0; InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); auto [NewRootOpc, NewPrevOpc] = getReassociationOpcodes(Pattern, Root, Prev); @@ -1369,6 +1366,7 @@ void TargetInstrInfo::reassociateOps( if (SwapPrevOperands) { std::swap(RegX, RegY); + std::swap(SubRegX, SubRegY); std::swap(KillX, KillY); } @@ -1421,9 +1419,9 @@ void TargetInstrInfo::reassociateOps( if (Idx == 0) continue; if (Idx == PrevFirstOpIdx) - MIB1.addReg(RegX, getKillRegState(KillX)); + MIB1.addReg(RegX, getKillRegState(KillX), SubRegX); else if (Idx == PrevSecondOpIdx) - MIB1.addReg(RegY, getKillRegState(KillY)); + MIB1.addReg(RegY, getKillRegState(KillY), SubRegY); else MIB1.add(MO); } @@ -1431,6 +1429,7 @@ void TargetInstrInfo::reassociateOps( if (SwapRootOperands) { std::swap(RegA, NewVR); + std::swap(SubRegA, SubRegNewVR); std::swap(KillA, KillNewVR); } @@ -1442,9 +1441,9 @@ void TargetInstrInfo::reassociateOps( if (Idx == 0) continue; if (Idx == RootFirstOpIdx) - MIB2 = MIB2.addReg(RegA, getKillRegState(KillA)); + MIB2 = MIB2.addReg(RegA, getKillRegState(KillA), SubRegA); else if (Idx == RootSecondOpIdx) - MIB2 = MIB2.addReg(NewVR, getKillRegState(KillNewVR)); + MIB2 = MIB2.addReg(NewVR, getKillRegState(KillNewVR), SubRegNewVR); else MIB2 = MIB2.add(MO); } @@ -1532,6 +1531,7 @@ void TargetInstrInfo::genAlternativeCodeSequence( if (IndexedReg.index() == 0) continue; + // FIXME: Losing subregisters MachineInstr *Instr = MRI.getUniqueVRegDef(IndexedReg.value()); MachineInstrBuilder MIB; Register AccReg; @@ -1704,8 +1704,7 @@ bool TargetInstrInfo::isSchedulingBoundary(const MachineInstr &MI, // stack slot reference to depend on the instruction that does the // modification. 
const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - return MI.modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), TRI); + return MI.modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), &TRI); } // Provide a global flag for disabling the PreRA hazard recognizer that targets @@ -1738,11 +1737,11 @@ CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, // Default implementation of getMemOperandWithOffset. bool TargetInstrInfo::getMemOperandWithOffset( const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, - bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const { + bool &OffsetIsScalable, const TargetRegisterInfo * /*RemoveMe*/) const { SmallVector BaseOps; LocationSize Width = LocationSize::precise(0); if (!getMemOperandsWithOffsetWidth(MI, BaseOps, Offset, OffsetIsScalable, - Width, TRI) || + Width, &TRI) || BaseOps.size() != 1) return false; BaseOp = BaseOps.front(); @@ -1863,7 +1862,6 @@ std::optional TargetInstrInfo::describeLoadedValue(const MachineInstr &MI, Register Reg) const { const MachineFunction *MF = MI.getMF(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); DIExpression *Expr = DIExpression::get(MF->getFunction().getContext(), {}); int64_t Offset; bool OffsetIsScalable; @@ -1894,7 +1892,6 @@ TargetInstrInfo::describeLoadedValue(const MachineInstr &MI, // Only describe memory which provably does not escape the function. As // described in llvm.org/PR43343, escaped memory may be clobbered by the // callee (or by another thread). - const auto &TII = MF->getSubtarget().getInstrInfo(); const MachineFrameInfo &MFI = MF->getFrameInfo(); const MachineMemOperand *MMO = MI.memoperands()[0]; const PseudoSourceValue *PSV = MMO->getPseudoValue(); @@ -1905,8 +1902,7 @@ TargetInstrInfo::describeLoadedValue(const MachineInstr &MI, return std::nullopt; const MachineOperand *BaseOp; - if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, - TRI)) + if (!getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, &TRI)) return std::nullopt; // FIXME: Scalable offsets are not yet handled in the offset code below. @@ -2045,7 +2041,7 @@ bool TargetInstrInfo::getInsertSubregInputs( // Returns a MIRPrinter comment for this machine operand. std::string TargetInstrInfo::createMIROperandComment( const MachineInstr &MI, const MachineOperand &Op, unsigned OpIdx, - const TargetRegisterInfo *TRI) const { + const TargetRegisterInfo * /*RemoveMe*/) const { if (!MI.isInlineAsm()) return ""; @@ -2078,12 +2074,8 @@ std::string TargetInstrInfo::createMIROperandComment( OS << F.getKindName(); unsigned RCID; - if (!F.isImmKind() && !F.isMemKind() && F.hasRegClassConstraint(RCID)) { - if (TRI) { - OS << ':' << TRI->getRegClassName(TRI->getRegClass(RCID)); - } else - OS << ":RC" << RCID; - } + if (!F.isImmKind() && !F.isMemKind() && F.hasRegClassConstraint(RCID)) + OS << ':' << TRI.getRegClassName(TRI.getRegClass(RCID)); if (F.isMemKind()) { InlineAsm::ConstraintCode MCID = F.getMemoryConstraintID(); diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index b99e1c7f19b71..3f2961cd83bab 100644 --- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -1402,7 +1402,7 @@ bool TwoAddressInstructionImpl::tryInstructionTransform( // Unfold the load. 
LLVM_DEBUG(dbgs() << "2addr: UNFOLDING: " << MI); const TargetRegisterClass *RC = TRI->getAllocatableClass( - TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI)); + TII->getRegClass(UnfoldMCID, LoadRegIndex)); Register Reg = MRI->createVirtualRegister(RC); SmallVector NewMIs; if (!TII->unfoldMemoryOperand(*MF, MI, Reg, diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 377e180d1d044..4e00daf50c147 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -538,7 +538,13 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, auto Int32Ty = Type::getInt32Ty(Builder.getContext()); constexpr size_t MaxDim = 3; Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim)); - Value *Flags = Builder.getInt64(KernelArgs.HasNoWait); + + Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait); + + Value *DynCGroupMemFallbackFlag = + Builder.getInt64(static_cast(KernelArgs.DynCGroupMemFallback)); + DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2); + Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag); assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty()); @@ -567,7 +573,7 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, Flags, NumTeams3D, NumThreads3D, - KernelArgs.DynCGGroupMem}; + KernelArgs.DynCGroupMem}; } void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) { @@ -8331,7 +8337,8 @@ static void emitTargetCall( OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector &Dependencies, - bool HasNoWait) { + bool HasNoWait, Value *DynCGroupMem, + OMPDynGroupprivateFallbackType DynCGroupMemFallback) { // Generate a function call to the host fallback implementation of the target // region. This is called by the host when no offload entry was generated for // the target region and when the offloading call fails at runtime. @@ -8467,12 +8474,13 @@ static void emitTargetCall( /*isSigned=*/false) : Builder.getInt64(0); - // TODO: Use correct DynCGGroupMem - Value *DynCGGroupMem = Builder.getInt32(0); + // Request zero groupprivate bytes by default. + if (!DynCGroupMem) + DynCGroupMem = Builder.getInt32(0); - KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount, - NumTeamsC, NumThreadsC, - DynCGGroupMem, HasNoWait); + KArgs = OpenMPIRBuilder::TargetKernelArgs( + NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem, + HasNoWait, DynCGroupMemFallback); // Assume no error was returned because TaskBodyCB and // EmitTargetCallFallbackCB don't produce any. 
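For reference, the flags word built in getKernelArgsVector above now packs the dyn_groupprivate fallback kind next to the nowait bit. A standalone sketch of the encoding; the helper name is illustrative, and the bit positions follow the Builder.CreateShl/CreateOr sequence in this hunk.

// Bit 0: HasNoWait. Bits 2..3: OMPDynGroupprivateFallbackType
// (Abort = 0, Null = 1, DefaultMem = 2). Other bits are left to the
// offload runtime.
static uint64_t encodeTargetKernelFlags(bool HasNoWait,
                                        omp::OMPDynGroupprivateFallbackType FB) {
  uint64_t Flags = HasNoWait ? 1 : 0;
  Flags |= static_cast<uint64_t>(FB) << 2;
  return Flags;
}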
@@ -8521,7 +8529,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, - const SmallVector &Dependencies, bool HasNowait) { + const SmallVector &Dependencies, bool HasNowait, + Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) { if (!updateToLocation(Loc)) return InsertPointTy(); @@ -8544,7 +8553,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( if (!Config.isTargetDevice()) emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs, IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB, - CustomMapperCB, Dependencies, HasNowait); + CustomMapperCB, Dependencies, HasNowait, DynCGroupMem, + DynCGroupMemFallback); return Builder.saveIP(); } @@ -8579,6 +8589,16 @@ OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name, // variable for possibly changing that to internal or private, or maybe // create different versions of the function for different OMP internal // variables. + const DataLayout &DL = M.getDataLayout(); + // TODO: Investigate why AMDGPU expects AS 0 for globals even though the + // default global AS is 1. + // See double-target-call-with-declare-target.f90 and + // declare-target-vars-in-target-region.f90 libomptarget + // tests. + unsigned AddressSpaceVal = AddressSpace ? AddressSpace + : M.getTargetTriple().isAMDGPU() + ? 0 + : DL.getDefaultGlobalsAddressSpace(); auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32 ? GlobalValue::InternalLinkage : GlobalValue::CommonLinkage; @@ -8586,7 +8606,6 @@ OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name, Constant::getNullValue(Ty), Elem.first(), /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal, AddressSpace); - const DataLayout &DL = M.getDataLayout(); const llvm::Align TypeAlign = DL.getABITypeAlign(Ty); const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace); GV->setAlignment(std::max(TypeAlign, PtrAlign)); diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 185f2e6464585..0896b3f70a29e 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -228,7 +228,7 @@ static cl::opt EnableLoopHeaderDuplication( static cl::opt EnableDFAJumpThreading("enable-dfa-jump-thread", cl::desc("Enable DFA jump threading"), - cl::init(true), cl::Hidden); + cl::init(false), cl::Hidden); static cl::opt EnableHotColdSplit("hot-cold-split", diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp index 246d90cce3a43..91f98cf7fac6c 100644 --- a/llvm/lib/Support/SpecialCaseList.cpp +++ b/llvm/lib/Support/SpecialCaseList.cpp @@ -14,24 +14,94 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/SpecialCaseList.h" +#include "llvm/ADT/RadixTree.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Support/GlobPattern.h" #include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Regex.h" #include "llvm/Support/VirtualFileSystem.h" -#include -#include +#include "llvm/Support/raw_ostream.h" #include #include #include #include #include +#include +#include 
namespace llvm { -Error SpecialCaseList::RegexMatcher::insert(StringRef Pattern, - unsigned LineNumber) { +namespace { + +using Match = std::pair<StringRef, unsigned>; +static constexpr Match NotMatched = {"", 0}; + +// Legacy v1 matcher. +class RegexMatcher { +public: + Error insert(StringRef Pattern, unsigned LineNumber); + void preprocess(bool BySize); + + Match match(StringRef Query) const; + + struct Reg { + Reg(StringRef Name, unsigned LineNo, Regex &&Rg) + : Name(Name), LineNo(LineNo), Rg(std::move(Rg)) {} + StringRef Name; + unsigned LineNo; + Regex Rg; + }; + + std::vector<Reg> RegExes; +}; + +class GlobMatcher { +public: + Error insert(StringRef Pattern, unsigned LineNumber); + void preprocess(bool BySize); + + Match match(StringRef Query) const; + + struct Glob { + Glob(StringRef Name, unsigned LineNo, GlobPattern &&Pattern) + : Name(Name), LineNo(LineNo), Pattern(std::move(Pattern)) {} + StringRef Name; + unsigned LineNo; + GlobPattern Pattern; + }; + + std::vector<Glob> Globs; + + RadixTree<iterator_range<StringRef::const_iterator>, + RadixTree<iterator_range<StringRef::const_reverse_iterator>, + SmallVector<int, 1>>> + PrefixSuffixToGlob; + + RadixTree<iterator_range<StringRef::const_iterator>, SmallVector<int, 1>> + SubstrToGlob; +}; + +/// Represents a set of patterns and their line numbers +class Matcher { +public: + Matcher(bool UseGlobs, bool RemoveDotSlash); + + Error insert(StringRef Pattern, unsigned LineNumber); + void preprocess(bool BySize); + Match match(StringRef Query) const; + + bool matchAny(StringRef Query) const { return match(Query).second > 0; } + + std::variant<RegexMatcher, GlobMatcher> M; + bool RemoveDotSlash; +}; + +Error RegexMatcher::insert(StringRef Pattern, unsigned LineNumber) { if (Pattern.empty()) return createStringError(errc::invalid_argument, "Supplied regex was blank"); @@ -55,7 +125,7 @@ Error SpecialCaseList::RegexMatcher::insert(StringRef Pattern, return Error::success(); } -void SpecialCaseList::RegexMatcher::preprocess(bool BySize) { +void RegexMatcher::preprocess(bool BySize) { if (BySize) { llvm::stable_sort(RegExes, [](const Reg &A, const Reg &B) { return A.Name.size() < B.Name.size(); @@ -63,16 +133,14 @@ void SpecialCaseList::RegexMatcher::preprocess(bool BySize) { } } -void SpecialCaseList::RegexMatcher::match( - StringRef Query, - llvm::function_ref<void(StringRef, unsigned)> Cb) const { +Match RegexMatcher::match(StringRef Query) const { for (const auto &R : reverse(RegExes)) if (R.Rg.match(Query)) - return Cb(R.Name, R.LineNo); + return {R.Name, R.LineNo}; + return NotMatched; } -Error SpecialCaseList::GlobMatcher::insert(StringRef Pattern, - unsigned LineNumber) { +Error GlobMatcher::insert(StringRef Pattern, unsigned LineNumber) { if (Pattern.empty()) return createStringError(errc::invalid_argument, "Supplied glob was blank"); @@ -83,14 +151,14 @@ Error SpecialCaseList::GlobMatcher::insert(StringRef Pattern, return Error::success(); } -void SpecialCaseList::GlobMatcher::preprocess(bool BySize) { +void GlobMatcher::preprocess(bool BySize) { if (BySize) { llvm::stable_sort(Globs, [](const Glob &A, const Glob &B) { return A.Name.size() < B.Name.size(); }); } - for (const auto &G : reverse(Globs)) { + for (const auto &[Idx, G] : enumerate(Globs)) { StringRef Prefix = G.Pattern.prefix(); StringRef Suffix = G.Pattern.suffix(); @@ -102,26 +170,28 @@ void SpecialCaseList::GlobMatcher::preprocess(bool BySize) { // But only if substring is not empty. Searching this tree is more // expensive.
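An aside on why GlobMatcher::preprocess buckets each glob by its literal prefix and suffix: a pattern such as "foo*bar" can only match a query that begins with "foo" and ends with "bar", so walking the two radix trees rejects most queries without ever invoking the glob engine. A standalone toy version of that pre-filter (plain C++, not the RadixTree API):

#include <cassert>
#include <string_view>

// A glob "<Prefix>*<Suffix>" needs a full match attempt only when the
// query is long enough and carries both literal pieces.
bool mayMatch(std::string_view Query, std::string_view Prefix,
              std::string_view Suffix) {
  return Query.size() >= Prefix.size() + Suffix.size() &&
         Query.substr(0, Prefix.size()) == Prefix &&
         Query.substr(Query.size() - Suffix.size()) == Suffix;
}

int main() {
  assert(mayMatch("foo/x/bar", "foo", "bar"));
  assert(!mayMatch("foo/x/baz", "foo", "bar"));
}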
auto &V = SubstrToGlob.emplace(Substr).first->second; - V.emplace_back(&G); + V.emplace_back(Idx); continue; } } auto &SToGlob = PrefixSuffixToGlob.emplace(Prefix).first->second; auto &V = SToGlob.emplace(reverse(Suffix)).first->second; - V.emplace_back(&G); + V.emplace_back(Idx); } } -void SpecialCaseList::GlobMatcher::match( - StringRef Query, - llvm::function_ref<void(StringRef, unsigned)> Cb) const { +Match GlobMatcher::match(StringRef Query) const { + int Best = -1; if (!PrefixSuffixToGlob.empty()) { for (const auto &[_, SToGlob] : PrefixSuffixToGlob.find_prefixes(Query)) { for (const auto &[_, V] : SToGlob.find_prefixes(reverse(Query))) { - for (const auto *G : V) { - if (G->Pattern.match(Query)) { - Cb(G->Name, G->LineNo); + for (int Idx : reverse(V)) { + if (Best > Idx) + break; + const GlobMatcher::Glob &G = Globs[Idx]; + if (G.Pattern.match(Query)) { + Best = Idx; // As soon as we find a match in the vector, we can break for this // vector, since the globs are already sorted by priority within the // prefix group. However, we continue searching other prefix groups @@ -138,9 +208,12 @@ void SpecialCaseList::GlobMatcher::match( // possibilities. In most cases search will fail on first characters. for (StringRef Q = Query; !Q.empty(); Q = Q.drop_front()) { for (const auto &[_, V] : SubstrToGlob.find_prefixes(Q)) { - for (const auto *G : V) { - if (G->Pattern.match(Query)) { - Cb(G->Name, G->LineNo); + for (int Idx : reverse(V)) { + if (Best > Idx) + break; + const GlobMatcher::Glob &G = Globs[Idx]; + if (G.Pattern.match(Query)) { + Best = Idx; // As soon as we find a match in the vector, we can break for this // vector, since the globs are already sorted by priority within the // prefix group. However, we continue searching other prefix groups @@ -151,9 +224,12 @@ void SpecialCaseList::GlobMatcher::match( } } } + if (Best < 0) + return NotMatched; + return {Globs[Best].Name, Globs[Best].LineNo}; } -SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash) +Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash) : RemoveDotSlash(RemoveDotSlash) { if (UseGlobs) M.emplace<GlobMatcher>(); @@ -161,21 +237,34 @@ SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash) M.emplace<RegexMatcher>(); } -Error SpecialCaseList::Matcher::insert(StringRef Pattern, unsigned LineNumber) { +Error Matcher::insert(StringRef Pattern, unsigned LineNumber) { return std::visit([&](auto &V) { return V.insert(Pattern, LineNumber); }, M); } -void SpecialCaseList::Matcher::preprocess(bool BySize) { +void Matcher::preprocess(bool BySize) { return std::visit([&](auto &V) { return V.preprocess(BySize); }, M); } -void SpecialCaseList::Matcher::match( - StringRef Query, - llvm::function_ref<void(StringRef, unsigned)> Cb) const { +Match Matcher::match(StringRef Query) const { if (RemoveDotSlash) Query = llvm::sys::path::remove_leading_dotslash(Query); - return std::visit([&](auto &V) { return V.match(Query, Cb); }, M); + return std::visit([&](auto &V) -> Match { return V.match(Query); }, M); } +} // namespace + +class SpecialCaseList::Section::SectionImpl { +public: + void preprocess(bool OrderBySize); + const Matcher *findMatcher(StringRef Prefix, StringRef Category) const; + + using SectionEntries = StringMap<StringMap<Matcher>>; + + explicit SectionImpl(bool UseGlobs) + : SectionMatcher(UseGlobs, /*RemoveDotSlash=*/false) {} + + Matcher SectionMatcher; + SectionEntries Entries; +}; // TODO: Refactor this to return Expected<...> std::unique_ptr<SpecialCaseList> @@ -233,11 +322,11 @@ bool SpecialCaseList::createInternal(const MemoryBuffer *MB, std::string &Error, Expected<SpecialCaseList::Section *> SpecialCaseList::addSection(StringRef
SectionStr, unsigned FileNo, unsigned LineNo, bool UseGlobs) { + SectionStr = SectionStr.copy(StrAlloc); Sections.emplace_back(SectionStr, FileNo, UseGlobs); auto &Section = Sections.back(); - SectionStr = SectionStr.copy(StrAlloc); - if (auto Err = Section.SectionMatcher.insert(SectionStr, LineNo)) { + if (auto Err = Section.Impl->SectionMatcher.insert(SectionStr, LineNo)) { return createStringError(errc::invalid_argument, "malformed section at line " + Twine(LineNo) + ": '" + SectionStr + @@ -264,11 +353,12 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB, bool RemoveDotSlash = Version > 2; - Section *CurrentSection; - if (auto Err = addSection("*", FileIdx, 1, true).moveInto(CurrentSection)) { + auto ErrOrSection = addSection("*", FileIdx, 1, true); + if (auto Err = ErrOrSection.takeError()) { Error = toString(std::move(Err)); return false; } + Section::SectionImpl *CurrentImpl = ErrOrSection.get()->Impl.get(); // This is the current list of prefixes for all existing users matching file // path. We may need parametrization in constructor in future. @@ -290,12 +380,13 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB, return false; } - if (auto Err = addSection(Line.drop_front().drop_back(), FileIdx, LineNo, - UseGlobs) - .moveInto(CurrentSection)) { + auto ErrOrSection = + addSection(Line.drop_front().drop_back(), FileIdx, LineNo, UseGlobs); + if (auto Err = ErrOrSection.takeError()) { Error = toString(std::move(Err)); return false; } + CurrentImpl = ErrOrSection.get()->Impl.get(); continue; } @@ -308,7 +399,7 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB, } auto [Pattern, Category] = Postfix.split("="); - auto [It, _] = CurrentSection->Entries[Prefix].try_emplace( + auto [It, _] = CurrentImpl->Entries[Prefix].try_emplace( Category, UseGlobs, RemoveDotSlash && llvm::is_contained(PathPrefixes, Prefix)); Pattern = Pattern.copy(StrAlloc); @@ -322,7 +413,7 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB, } for (Section &S : Sections) - S.preprocess(OrderBySize); + S.Impl->preprocess(OrderBySize); return true; } @@ -339,7 +430,7 @@ std::pair<unsigned, unsigned> SpecialCaseList::inSectionBlame(StringRef Section, StringRef Prefix, StringRef Query, StringRef Category) const { for (const auto &S : reverse(Sections)) { - if (S.SectionMatcher.matchAny(Section)) { + if (S.Impl->SectionMatcher.matchAny(Section)) { unsigned Blame = S.getLastMatch(Prefix, Query, Category); if (Blame) return {S.FileIdx, Blame}; @@ -348,9 +439,22 @@ SpecialCaseList::inSectionBlame(StringRef Section, StringRef Prefix, return NotFound; } -const SpecialCaseList::Matcher * -SpecialCaseList::Section::findMatcher(StringRef Prefix, - StringRef Category) const { +SpecialCaseList::Section::Section(StringRef Str, unsigned FileIdx, + bool UseGlobs) + : Name(Str), FileIdx(FileIdx), + Impl(std::make_unique<SectionImpl>(UseGlobs)) {} + +SpecialCaseList::Section::Section(Section &&) = default; + +SpecialCaseList::Section::~Section() = default; + +bool SpecialCaseList::Section::matchName(StringRef Name) const { + return Impl->SectionMatcher.matchAny(Name); +} + +const Matcher * +SpecialCaseList::Section::SectionImpl::findMatcher(StringRef Prefix, + StringRef Category) const { SectionEntries::const_iterator I = Entries.find(Prefix); if (I == Entries.end()) return nullptr; @@ -361,7 +465,7 @@ SpecialCaseList::Section::findMatcher, return &II->second; } -LLVM_ABI void SpecialCaseList::Section::preprocess(bool OrderBySize) { +void
SpecialCaseList::Section::SectionImpl::preprocess(bool OrderBySize) { SectionMatcher.preprocess(false); for (auto &[K1, E] : Entries) for (auto &[K2, M] : E) @@ -371,26 +475,21 @@ LLVM_ABI void SpecialCaseList::Section::preprocess(bool OrderBySize) { unsigned SpecialCaseList::Section::getLastMatch(StringRef Prefix, StringRef Query, StringRef Category) const { - unsigned LastLine = 0; - if (const Matcher *M = findMatcher(Prefix, Category)) { - M->match(Query, [&](StringRef, unsigned LineNo) { - LastLine = std::max(LastLine, LineNo); - }); - } - return LastLine; + if (const Matcher *M = Impl->findMatcher(Prefix, Category)) + return M->match(Query).second; + return 0; } StringRef SpecialCaseList::Section::getLongestMatch(StringRef Prefix, StringRef Query, StringRef Category) const { - StringRef LongestRule; - if (const Matcher *M = findMatcher(Prefix, Category)) { - M->match(Query, [&](StringRef Rule, unsigned) { - if (LongestRule.size() < Rule.size()) - LongestRule = Rule; - }); - } - return LongestRule; + if (const Matcher *M = Impl->findMatcher(Prefix, Category)) + return M->match(Query).first; + return {}; +} + +bool SpecialCaseList::Section::hasPrefix(StringRef Prefix) const { + return Impl->Entries.find(Prefix) != Impl->Entries.end(); } } // namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index cb831963759b5..7712d2a1d88d8 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -629,8 +629,7 @@ void SSACCmpConv::convert(SmallVectorImpl &RemovedBlocks) { } const MCInstrDesc &MCID = TII->get(Opc); // Create a dummy virtual register for the SUBS def. - Register DestReg = - MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI)); + Register DestReg = MRI->createVirtualRegister(TII->getRegClass(MCID, 0)); // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz. BuildMI(*Head, Head->end(), TermDL, MCID) .addReg(DestReg, RegState::Define | RegState::Dead) @@ -638,8 +637,7 @@ void SSACCmpConv::convert(SmallVectorImpl &RemovedBlocks) { .addImm(0) .addImm(0); // SUBS uses the GPR*sp register classes. 
- MRI->constrainRegClass(HeadCond[2].getReg(), - TII->getRegClass(MCID, 1, TRI)); + MRI->constrainRegClass(HeadCond[2].getReg(), TII->getRegClass(MCID, 1)); } Head->splice(Head->end(), CmpBB, CmpBB->begin(), CmpBB->end()); @@ -686,10 +684,10 @@ void SSACCmpConv::convert(SmallVectorImpl &RemovedBlocks) { unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CmpBBTailCC); const MCInstrDesc &MCID = TII->get(Opc); MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(), - TII->getRegClass(MCID, 0, TRI)); + TII->getRegClass(MCID, 0)); if (CmpMI->getOperand(FirstOp + 1).isReg()) MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(), - TII->getRegClass(MCID, 1, TRI)); + TII->getRegClass(MCID, 1)); MachineInstrBuilder MIB = BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID) .add(CmpMI->getOperand(FirstOp)); // Register Rn if (isZBranch) diff --git a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index 75361f5d313c6..4ff49a627c794 100644 --- a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -156,7 +156,7 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock( LLVM_DEBUG(dbgs() << " Ignoring, def is tied operand.\n"); continue; } - const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI); + const TargetRegisterClass *RC = TII->getRegClass(Desc, I); unsigned NewReg; if (RC == nullptr) { LLVM_DEBUG(dbgs() << " Ignoring, register is not a GPR.\n"); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 55a04cca4c394..b93e562f4cee5 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -91,7 +91,7 @@ static cl::opt GatherOptSearchLimit( "machine-combiner gather pattern optimization")); AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) - : AArch64GenInstrInfo(STI, AArch64::ADJCALLSTACKDOWN, + : AArch64GenInstrInfo(STI, RI, AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, AArch64::CATCHRET), RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {} @@ -1780,6 +1780,16 @@ static unsigned sForm(MachineInstr &Instr) { case AArch64::SUBSWri: case AArch64::SUBSXrr: case AArch64::SUBSXri: + case AArch64::ANDSWri: + case AArch64::ANDSWrr: + case AArch64::ANDSWrs: + case AArch64::ANDSXri: + case AArch64::ANDSXrr: + case AArch64::ANDSXrs: + case AArch64::BICSWrr: + case AArch64::BICSXrr: + case AArch64::BICSWrs: + case AArch64::BICSXrs: return Instr.getOpcode(); case AArch64::ADDWrr: @@ -1810,6 +1820,22 @@ static unsigned sForm(MachineInstr &Instr) { return AArch64::ANDSWri; case AArch64::ANDXri: return AArch64::ANDSXri; + case AArch64::ANDWrr: + return AArch64::ANDSWrr; + case AArch64::ANDWrs: + return AArch64::ANDSWrs; + case AArch64::ANDXrr: + return AArch64::ANDSXrr; + case AArch64::ANDXrs: + return AArch64::ANDSXrs; + case AArch64::BICWrr: + return AArch64::BICSWrr; + case AArch64::BICXrr: + return AArch64::BICSXrr; + case AArch64::BICWrs: + return AArch64::BICSWrs; + case AArch64::BICXrs: + return AArch64::BICSXrs; } } @@ -1947,6 +1973,25 @@ static bool isSUBSRegImm(unsigned Opcode) { return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; } +static bool isANDOpcode(MachineInstr &MI) { + unsigned Opc = sForm(MI); + switch (Opc) { + case AArch64::ANDSWri: + case AArch64::ANDSWrr: + case AArch64::ANDSWrs: + case AArch64::ANDSXri: + case AArch64::ANDSXrr: + case AArch64::ANDSXrs: + 
case AArch64::BICSWrr: + case AArch64::BICSXrr: + case AArch64::BICSWrs: + case AArch64::BICSXrs: + return true; + default: + return false; + } +} + /// Check if CmpInstr can be substituted by MI. /// /// CmpInstr can be substituted: @@ -1984,7 +2029,8 @@ static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, // 1) MI and CmpInstr set N and V to the same value. // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when // signed overflow occurs, so CmpInstr could still be simplified away. - if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap)) + // Note that Ands and Bics instructions always clear the V flag. + if (NZVCUsed->V && !MI.getFlag(MachineInstr::NoSWrap) && !isANDOpcode(MI)) return false; AccessKind AccessToCheck = AK_Write; @@ -5618,7 +5664,6 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); @@ -5632,7 +5677,7 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, bool Offset = true; MCRegister PNRReg = MCRegister::NoRegister; unsigned StackID = TargetStackID::Default; - switch (TRI->getSpillSize(*RC)) { + switch (RI.getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) Opc = AArch64::STRBui; @@ -5795,10 +5840,12 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, .addMemOperand(MMO); } -void AArch64InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); @@ -5810,7 +5857,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( bool Offset = true; unsigned StackID = TargetStackID::Default; Register PNRReg = MCRegister::NoRegister; - switch (TRI->getSpillSize(*RC)) { + switch (TRI.getSpillSize(*RC)) { case 1: if (AArch64::FPR8RegClass.hasSubClassEq(RC)) Opc = AArch64::LDRBui; @@ -6446,10 +6493,10 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( "Mismatched register size in non subreg COPY"); if (IsSpill) storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, - getRegClass(SrcReg), &TRI, Register()); + getRegClass(SrcReg), Register()); else loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, - getRegClass(DstReg), &TRI, Register()); + getRegClass(DstReg), Register()); return &*--InsertPt; } @@ -6467,8 +6514,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( assert(SrcMO.getSubReg() == 0 && "Unexpected subreg on physical register"); storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(), - FrameIndex, &AArch64::GPR64RegClass, &TRI, - Register()); + FrameIndex, &AArch64::GPR64RegClass, Register()); return &*--InsertPt; } @@ -6502,7 +6548,7 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == TRI.getRegSizeInBits(*FillRC) && "Mismatched regclass size on folded subreg COPY"); - loadRegFromStackSlot(MBB, 
InsertPt, DstReg, FrameIndex, FillRC, &TRI, + loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, Register()); MachineInstr &LoadMI = *--InsertPt; MachineOperand &LoadDst = LoadMI.getOperand(0); @@ -11017,8 +11063,6 @@ static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, MachineBasicBlock::iterator InsertTo) { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = - MBB.getParent()->getSubtarget().getRegisterInfo(); MachineInstr *NewMI = MBB.getParent()->CloneMachineInstr(MI); Register Result = 0; for (unsigned I = 0; I < NewMI->getNumOperands(); ++I) { @@ -11027,8 +11071,7 @@ static Register cloneInstr(const MachineInstr *MI, unsigned ReplaceOprNum, MRI.getRegClass(NewMI->getOperand(0).getReg())); NewMI->getOperand(I).setReg(Result); } else if (I == ReplaceOprNum) { - MRI.constrainRegClass(ReplaceReg, - TII->getRegClass(NewMI->getDesc(), I, TRI)); + MRI.constrainRegClass(ReplaceReg, TII->getRegClass(NewMI->getDesc(), I)); NewMI->getOperand(I).setReg(ReplaceReg); } } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 179574a73aa01..979c9acbd48e1 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -353,14 +353,13 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; // This tells target independent code that it is okay to pass instructions diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index 04e76c7abd202..d25db89cca358 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -595,17 +595,17 @@ bool AArch64MIPeepholeOpt::splitTwoPartImm( // Determine register classes for destinations and register operands const TargetRegisterClass *FirstInstrDstRC = - TII->getRegClass(TII->get(Opcode.first), 0, TRI); + TII->getRegClass(TII->get(Opcode.first), 0); const TargetRegisterClass *FirstInstrOperandRC = - TII->getRegClass(TII->get(Opcode.first), 1, TRI); + TII->getRegClass(TII->get(Opcode.first), 1); const TargetRegisterClass *SecondInstrDstRC = (Opcode.first == Opcode.second) ? FirstInstrDstRC - : TII->getRegClass(TII->get(Opcode.second), 0, TRI); + : TII->getRegClass(TII->get(Opcode.second), 0); const TargetRegisterClass *SecondInstrOperandRC = (Opcode.first == Opcode.second) ? 
FirstInstrOperandRC - : TII->getRegClass(TII->get(Opcode.second), 1, TRI); + : TII->getRegClass(TII->get(Opcode.second), 1); // Get old registers destinations and new register destinations Register DstReg = MI.getOperand(0).getReg(); @@ -784,14 +784,14 @@ bool AArch64MIPeepholeOpt::visitUBFMXri(MachineInstr &MI) { } const TargetRegisterClass *DstRC64 = - TII->getRegClass(TII->get(MI.getOpcode()), 0, TRI); + TII->getRegClass(TII->get(MI.getOpcode()), 0); const TargetRegisterClass *DstRC32 = TRI->getSubRegisterClass(DstRC64, AArch64::sub_32); assert(DstRC32 && "Destination register class of UBFMXri doesn't have a " "sub_32 subregister class"); const TargetRegisterClass *SrcRC64 = - TII->getRegClass(TII->get(MI.getOpcode()), 1, TRI); + TII->getRegClass(TII->get(MI.getOpcode()), 1); const TargetRegisterClass *SrcRC32 = TRI->getSubRegisterClass(SrcRC64, AArch64::sub_32); assert(SrcRC32 && "Source register class of UBFMXri doesn't have a sub_32 " diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index eaf8723094797..f3cf222038072 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -897,7 +897,7 @@ AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, const MCInstrDesc &MCID = TII->get(AArch64::ADDXri); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); Register BaseReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass); - MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this)); + MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0)); unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0); BuildMI(*MBB, Ins, DL, MCID, BaseReg) diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 8974965c41fe3..ab4004e30f629 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -157,7 +157,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { bool enableMachineScheduler() const override { return true; } bool enablePostRAScheduler() const override { return usePostRAScheduler(); } bool enableSubRegLiveness() const override { return EnableSubregLiveness; } - + bool enableTerminalRule() const override { return true; } bool enableMachinePipeliner() const override; bool useDFAforSMS() const override { return false; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 54d94b1f8682e..0b61adf409948 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -2069,6 +2069,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureMemoryAtomicFAddF32DenormalSupport, FeatureBVHDualAndBVH8Insts, FeatureWaitsBeforeSystemScopeStores, + FeatureD16Writes32BitVgpr ]>; def FeatureISAVersion12_50 : FeatureSet< @@ -2143,6 +2144,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureSupportsXNACK, FeatureXNACK, FeatureClusters, + FeatureD16Writes32BitVgpr, ]>; def FeatureISAVersion12_51 : FeatureSet< diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c1ee3a2ac6a89..125e212a1b946 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5081,17 +5081,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned MinNumRegsRequired = DstSize / 32; const SIMachineFunctionInfo *Info = MF.getInfo(); + bool UseAGPRForm = 
Info->selectAGPRFormMFMA(MinNumRegsRequired); + OpdsMapping[0] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[4] = - Info->getMinNumAGPRs() >= MinNumRegsRequired - ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) - : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); + UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI) + : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); OpdsMapping[8] = getVGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI); OpdsMapping[10] = getVGPROpMapping(MI.getOperand(10).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 0ea9add891111..b03d50f2d451d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -261,13 +261,6 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( const Function *Callee = getCalleeFunction(*CalleeOp); - // Avoid crashing on undefined behavior with an illegal call to a - // kernel. If a callsite's calling convention doesn't match the - // function's, it's undefined behavior. If the callsite calling - // convention does match, that would have errored earlier. - if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv())) - report_fatal_error("invalid call to entry function"); - auto isSameFunction = [](const MachineFunction &MF, const Function *F) { return F == &MF.getFunction(); }; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index c18bd70099a33..2da6a8e5652ef 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -2011,7 +2011,7 @@ void PreRARematStage::rematerialize() { // Rematerialize DefMI to its use block. TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, - AMDGPU::NoSubRegister, *DefMI, *DAG.TRI); + AMDGPU::NoSubRegister, *DefMI); Remat.RematMI = &*std::prev(InsertPos); DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI); @@ -2158,8 +2158,7 @@ void PreRARematStage::finalizeGCNSchedStage() { // Re-rematerialize MI at the end of its original region. Note that it may // not be rematerialized exactly in the same position as originally within // the region, but it should not matter much. 
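The reMaterialize call below drops its trailing *DAG.TRI argument, the same cleanup applied throughout this patch: TargetInstrInfo implementations now receive their register info once, at construction, instead of at every call site. A minimal sketch of that dependency-injection pattern with stand-in types (not the actual LLVM classes):

#include <cstdio>

struct RegInfoStub { // stand-in for TargetRegisterInfo
  unsigned spillSize(unsigned RC) const { return RC * 4; }
};

class InstrInfoStub { // stand-in for a *GenInstrInfo subclass
  const RegInfoStub &TRI; // injected once, as in the new constructors
public:
  explicit InstrInfoStub(const RegInfoStub &RI) : TRI(RI) {}
  // Callers no longer pass a TargetRegisterInfo* to each query.
  unsigned spillSize(unsigned RC) const { return TRI.spillSize(RC); }
};

int main() {
  RegInfoStub RI;
  InstrInfoStub TII(RI);
  std::printf("%u\n", TII.spillSize(2)); // prints 8
}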
- TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI, - *DAG.TRI); + TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI); MachineInstr *NewMI = &*std::prev(InsertPos); DAG.LIS->InsertMachineInstrInMaps(*NewMI); diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index 3e256cce97afb..01040854e1577 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -29,7 +29,7 @@ using namespace llvm; #include "R600GenInstrInfo.inc" R600InstrInfo::R600InstrInfo(const R600Subtarget &ST) - : R600GenInstrInfo(ST, -1, -1), RI(), ST(ST) {} + : R600GenInstrInfo(ST, RI, -1, -1), RI(), ST(ST) {} bool R600InstrInfo::isVector(const MachineInstr &MI) const { return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 278101d6d608e..f7b09cec4c542 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -713,7 +713,7 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { // Verify the register is compatible with the operand. if (const TargetRegisterClass *OpRC = - TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) { + TII->getRegClass(MI->getDesc(), Fold.UseOpNo)) { const TargetRegisterClass *NewRC = TRI->getRegClassForReg(*MRI, New->getReg()); @@ -2393,7 +2393,7 @@ bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) { unsigned OpIdx = Op - &UseMI->getOperand(0); const MCInstrDesc &InstDesc = UseMI->getDesc(); - const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx, TRI); + const TargetRegisterClass *OpRC = TII->getRegClass(InstDesc, OpIdx); if (!OpRC || !TRI->isVectorSuperClass(OpRC)) return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 89517a8f2d78c..b3c351a5ba6ce 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -63,7 +63,8 @@ static cl::opt Fix16BitCopies( cl::ReallyHidden); SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) - : AMDGPUGenInstrInfo(ST, AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), + : AMDGPUGenInstrInfo(ST, RI, AMDGPU::ADJCALLSTACKUP, + AMDGPU::ADJCALLSTACKDOWN), RI(ST), ST(ST) { SchedModel.init(&ST); } @@ -1704,7 +1705,7 @@ void SIInstrInfo::storeRegToStackSlotImpl( MachineMemOperand *MMO = MF->getMachineMemOperand( PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex), FrameInfo.getObjectAlign(FrameIndex)); - unsigned SpillSize = TRI->getSpillSize(*RC); + unsigned SpillSize = RI.getSpillSize(*RC); MachineRegisterInfo &MRI = MF->getRegInfo(); if (RI.isSGPRClass(RC)) { @@ -1750,9 +1751,9 @@ void SIInstrInfo::storeRegToStackSlotImpl( void SIInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags) const { - storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, TRI, VReg, + storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, &TRI, VReg, Flags, false); } @@ -1906,14 +1907,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { 
MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); const DebugLoc &DL = MBB.findDebugLoc(MI); - unsigned SpillSize = TRI->getSpillSize(*RC); + unsigned SpillSize = RI.getSpillSize(*RC); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); @@ -2562,8 +2562,8 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - unsigned SubIdx, const MachineInstr &Orig, - const TargetRegisterInfo &RI) const { + unsigned SubIdx, + const MachineInstr &Orig) const { // Try shrinking the instruction to remat only the part needed for current // context. @@ -2613,7 +2613,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, const MCInstrDesc &TID = get(NewOpcode); const TargetRegisterClass *NewRC = - RI.getAllocatableClass(getRegClass(TID, 0, &RI)); + RI.getAllocatableClass(getRegClass(TID, 0)); MRI.setRegClass(DestReg, NewRC); UseMO->setReg(DestReg); @@ -2643,7 +2643,7 @@ void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, break; } - TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI); + TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig); } std::pair @@ -3656,7 +3656,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) { const MCInstrDesc &MovDesc = get(MovOp); - const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI); + const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0); if (Is16Bit) { // We just need to find a correctly sized register class, so the // subregister index compatibility doesn't matter since we're statically @@ -6079,9 +6079,8 @@ SIInstrInfo::getWholeWaveFunctionSetup(MachineFunction &MF) const { // FIXME: This should not be an overridable function. All subtarget dependent // operand modifications should go through isLookupRegClassByHwMode in the // generic handling. 
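The getRegClass override that follows loses its TRI parameter for the same reason. Stripped of the SI-specific special cases, the core of such a lookup just maps the operand's declared MC register-class ID through the register info; a simplified sketch, not the full SIInstrInfo logic (which also handles hw-mode dependent classes):

#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/MC/MCInstrDesc.h"
using namespace llvm;

// Resolve the register class an instruction's OpNum-th operand was
// declared with, or null for immediates and out-of-range operands.
static const TargetRegisterClass *
lookupOperandRC(const MCInstrDesc &Desc, unsigned OpNum,
                const TargetRegisterInfo &TRI) {
  if (OpNum >= Desc.getNumOperands())
    return nullptr;
  int RCID = Desc.operands()[OpNum].RegClass;
  return RCID < 0 ? nullptr : TRI.getRegClass(RCID);
}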
-const TargetRegisterClass * -SIInstrInfo::getRegClass(const MCInstrDesc &TID, unsigned OpNum, - const TargetRegisterInfo *TRI) const { +const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID, + unsigned OpNum) const { if (OpNum >= TID.getNumOperands()) return nullptr; const MCOperandInfo &OpInfo = TID.operands()[OpNum]; @@ -6856,7 +6855,7 @@ void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, return; const TargetRegisterClass *DeclaredRC = - getRegClass(MI.getDesc(), SAddr->getOperandNo(), &RI); + getRegClass(MI.getDesc(), SAddr->getOperandNo()); Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC); SAddr->setReg(ToSGPR); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 97ca9edd97c6a..44470bd92c7b0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -323,22 +323,19 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const override; + const MachineInstr &Orig) const override; // Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp // instructions. Returns a pair of generated instructions. @@ -1639,9 +1636,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { /// Return true if this opcode should not be used by codegen. 
bool isAsmOnlyOpcode(int MCOp) const; - const TargetRegisterClass * - getRegClass(const MCInstrDesc &TID, unsigned OpNum, - const TargetRegisterInfo *TRI) const override; + const TargetRegisterClass *getRegClass(const MCInstrDesc &TID, + unsigned OpNum) const override; void fixImplicitOperands(MachineInstr &MI) const; diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index e8a6bce317f3e..09bd7a0b622ef 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -233,11 +233,11 @@ class SILoadStoreOptimizer { void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, - AMDGPU::OpName OpName, Register DestReg, - MachineInstr *NewMI) const; + const DebugLoc &DL, AMDGPU::OpName OpName, + Register DestReg, MachineInstr *NewMI) const; Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, - AMDGPU::OpName OpName) const; + const DebugLoc &DL, AMDGPU::OpName OpName) const; unsigned read2Opcode(unsigned EltSize) const; unsigned read2ST64Opcode(unsigned EltSize) const; @@ -1337,11 +1337,9 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(), AMDGPU::OpName::data1); - const TargetRegisterClass *DataRC0 = - TII->getRegClass(Write2Opc, Data0Idx, TRI); + const TargetRegisterClass *DataRC0 = TII->getRegClass(Write2Opc, Data0Idx); - const TargetRegisterClass *DataRC1 = - TII->getRegClass(Write2Opc, Data1Idx, TRI); + const TargetRegisterClass *DataRC1 = TII->getRegClass(Write2Opc, Data1Idx); if (unsigned SubReg = Data0->getSubReg()) { DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()), @@ -1368,11 +1366,10 @@ SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, // Paired. 
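The merge helpers below now take the DebugLoc explicitly because each caller computes it with DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()), so the combined load or store no longer claims to originate solely from CI.I. A toy model of the merged-location semantics; the real DILocation::getMergedLocation additionally searches for common scopes and inlining chains:

#include <cassert>

// Keep a location only when both inputs agree; otherwise fall back to an
// "unknown" (line-0) location, as debug info does for merged instructions.
struct Loc { unsigned Line = 0; unsigned Col = 0; };

Loc mergedLocationModel(Loc A, Loc B) {
  if (A.Line == B.Line && A.Col == B.Col)
    return A;
  return Loc{}; // line 0: no single source position
}

int main() {
  assert(mergedLocationModel({10, 3}, {10, 3}).Line == 10);
  assert(mergedLocationModel({10, 3}, {12, 1}).Line == 0);
}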
void SILoadStoreOptimizer::copyToDestRegs( CombineInfo &CI, CombineInfo &Paired, - MachineBasicBlock::iterator InsertBefore, AMDGPU::OpName OpName, - Register DestReg, MachineInstr *NewMI) const { + MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL, + AMDGPU::OpName OpName, Register DestReg, MachineInstr *NewMI) const { MachineBasicBlock *MBB = CI.I->getParent(); MachineFunction *MF = MBB->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); @@ -1411,9 +1408,9 @@ void SILoadStoreOptimizer::copyToDestRegs( Register SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore, + const DebugLoc &DL, AMDGPU::OpName OpName) const { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); @@ -1469,7 +1466,8 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); Register DestReg = MRI->createVirtualRegister(SuperRC); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); Register BaseReg = AddrReg->getReg(); unsigned BaseSubReg = AddrReg->getSubReg(); @@ -1497,7 +1495,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, .addImm(0) // gds .cloneMergedMemRefs({&*CI.I, &*Paired.I}); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg, + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg, Read2); CI.I->eraseFromParent(); @@ -1555,7 +1553,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); const MCInstrDesc &Write2Desc = TII->get(Opc); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); Register BaseReg = AddrReg->getReg(); unsigned BaseSubReg = AddrReg->getSubReg(); @@ -1596,7 +1595,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); + const unsigned Opcode = getNewOpcode(CI, Paired); const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); @@ -1621,7 +1622,7 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg, New); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg, New); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1632,7 +1633,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); + const unsigned Opcode = getNewOpcode(CI, Paired); const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); @@ -1653,7 +1656,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( New.addImm(MergedOffset); 
New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg, New); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::sdst, DestReg, New); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1664,7 +1667,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1694,7 +1699,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( .addImm(0) // swz .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg, New); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg, New); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1705,7 +1710,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1745,7 +1752,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( .addImm(0) // swz .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg, New); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg, New); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1756,12 +1763,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); Register SrcReg = - copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); + copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata); auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); @@ -1803,7 +1811,9 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); @@ -1821,7 +1831,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( .addImm(CI.CPol) .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); - copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg, New); + copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg, New); CI.I->eraseFromParent(); Paired.I->eraseFromParent(); @@ -1832,12 +1842,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = 
CI.I->getDebugLoc(); + + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); Register SrcReg = - copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); + copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata); auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) @@ -2108,12 +2120,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( CombineInfo &CI, CombineInfo &Paired, MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); - DebugLoc DL = CI.I->getDebugLoc(); + DebugLoc DL = + DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc()); const unsigned Opcode = getNewOpcode(CI, Paired); Register SrcReg = - copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); + copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata); auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) .addReg(SrcReg, RegState::Kill); diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index a78e19218c315..ce2f5ccef428e 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -106,10 +106,10 @@ static void insertCSRSaves(const GCNSubtarget &ST, MachineBasicBlock &SaveBlock, SlotIndexes *Indexes, LiveIntervals *LIS) { const TargetFrameLowering *TFI = ST.getFrameLowering(); - const TargetRegisterInfo *TRI = ST.getRegisterInfo(); + const SIRegisterInfo *RI = ST.getRegisterInfo(); MachineBasicBlock::iterator I = SaveBlock.begin(); MachineInstrSpan MIS(I, &SaveBlock); - bool Success = TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI); + bool Success = TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, RI); assert(Success && "spillCalleeSavedRegisters should always succeed"); (void)Success; diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index caff354c73510..86ca22cfeffd8 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1346,7 +1346,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, continue; unsigned I = Op.getOperandNo(); - const TargetRegisterClass *OpRC = TII->getRegClass(Desc, I, TRI); + const TargetRegisterClass *OpRC = TII->getRegClass(Desc, I); if (!OpRC || !TRI->isVSSuperClass(OpRC)) continue; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 54f57e02ed47e..85adcab55b742 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -513,6 +513,13 @@ defm V_CVT_U16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_u16_f16", VOP_I16_F16_SPECIAL_OMOD, defm V_CVT_I16_F16 : VOP1Inst_t16_with_profiles <"v_cvt_i16_f16", VOP_I16_F16_SPECIAL_OMOD, VOP_I16_F16_SPECIAL_OMOD_t16, VOP_I16_F16_SPECIAL_OMOD_fake16, fp_to_sint>; + +let HasClamp = 0, HasOMod = 0 in { +def V_TRANS_BF16_Profile : VOPProfile <[bf16, bf16, untyped, untyped]>; +def V_TRANS_BF16_t16_Profile : VOPProfile_True16 <V_TRANS_BF16_Profile>; +def V_TRANS_BF16_fake16_Profile : VOPProfile_Fake16 <V_TRANS_BF16_Profile>; +} + let TRANS = 1, SchedRW = [WriteTrans32] in { defm V_RCP_F16 : VOP1Inst_t16 <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>; defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>; @@ -527,14 +534,30 @@ defm V_TANH_F16 : VOP1Inst_t16 <"v_tanh_f16", VOP_F16_F16, int_amdgcn_tanh>; } let SubtargetPredicate =
HasBF16TransInsts in { -defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>; -defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>; -defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>; -defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>; -defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>; -defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>; -defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>; -defm V_COS_BF16 : VOP1Inst_t16 <"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>; +defm V_TANH_BF16 : VOP1Inst_t16_with_profiles<"v_tanh_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + int_amdgcn_tanh>; +defm V_RCP_BF16 : VOP1Inst_t16_with_profiles<"v_rcp_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUrcp>; +defm V_SQRT_BF16 : VOP1Inst_t16_with_profiles<"v_sqrt_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + any_amdgcn_sqrt>; +defm V_RSQ_BF16 : VOP1Inst_t16_with_profiles<"v_rsq_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUrsq>; +defm V_LOG_BF16 : VOP1Inst_t16_with_profiles<"v_log_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUlogf16>; +defm V_EXP_BF16 : VOP1Inst_t16_with_profiles<"v_exp_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUexpf16>; +defm V_SIN_BF16 : VOP1Inst_t16_with_profiles<"v_sin_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUsin>; +defm V_COS_BF16 : VOP1Inst_t16_with_profiles<"v_cos_bf16", V_TRANS_BF16_Profile, + V_TRANS_BF16_t16_Profile, V_TRANS_BF16_fake16_Profile, + AMDGPUcos>; } } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 4e63ad54bef27..94bb60861d670 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -1357,8 +1357,12 @@ class VOPBinOpClampPat : class getVOP3ModPat { dag src0 = !if(P.HasOMod, - (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), - (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)); + !if(P.HasClamp, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i32:$omod)), + !if(P.HasClamp, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers))); list ret3 = [(set P.DstVT:$vdst, (DivergentFragOrOp.ret (P.Src0VT src0), diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.cpp b/llvm/lib/Target/ARC/ARCInstrInfo.cpp index 05bcb3596ac48..e17ecbf87faae 100644 --- a/llvm/lib/Target/ARC/ARCInstrInfo.cpp +++ b/llvm/lib/Target/ARC/ARCInstrInfo.cpp @@ -44,7 +44,8 @@ enum TSFlagsConstants { void ARCInstrInfo::anchor() {} ARCInstrInfo::ARCInstrInfo(const ARCSubtarget &ST) - : ARCGenInstrInfo(ST, ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP), RI(ST) {} + : ARCGenInstrInfo(ST, RI, ARC::ADJCALLSTACKDOWN, ARC::ADJCALLSTACKUP), + RI(ST) {} static bool isZeroImm(const MachineOperand &Op) { return Op.isImm() && Op.getImm() == 0; @@ -293,8 +294,7 @@ void ARCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, void 
ARCInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, - bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL = MBB.findDebugLoc(I); MachineFunction &MF = *MBB.getParent(); @@ -306,11 +306,11 @@ void ARCInstrInfo::storeRegToStackSlot( MFI.getObjectAlign(FrameIndex)); assert(MMO && "Couldn't get MachineMemOperand for store to stack."); - assert(TRI->getSpillSize(*RC) == 4 && + assert(TRI.getSpillSize(*RC) == 4 && "Only support 4-byte stores to stack now."); assert(ARC::GPR32RegClass.hasSubClassEq(RC) && "Only support GPR32 stores to stack now."); - LLVM_DEBUG(dbgs() << "Created store reg=" << printReg(SrcReg, TRI) + LLVM_DEBUG(dbgs() << "Created store reg=" << printReg(SrcReg, &TRI) << " to FrameIndex=" << FrameIndex << "\n"); BuildMI(MBB, I, DL, get(ARC::ST_rs9)) .addReg(SrcReg, getKillRegState(IsKill)) @@ -323,7 +323,6 @@ void ARCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL = MBB.findDebugLoc(I); @@ -335,11 +334,11 @@ void ARCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MFI.getObjectAlign(FrameIndex)); assert(MMO && "Couldn't get MachineMemOperand for store to stack."); - assert(TRI->getSpillSize(*RC) == 4 && + assert(TRI.getSpillSize(*RC) == 4 && "Only support 4-byte loads from stack now."); assert(ARC::GPR32RegClass.hasSubClassEq(RC) && "Only support GPR32 stores to stack now."); - LLVM_DEBUG(dbgs() << "Created load reg=" << printReg(DestReg, TRI) + LLVM_DEBUG(dbgs() << "Created load reg=" << printReg(DestReg, &TRI) << " from FrameIndex=" << FrameIndex << "\n"); BuildMI(MBB, I, DL, get(ARC::LD_rs9)) .addReg(DestReg, RegState::Define) diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.h b/llvm/lib/Target/ARC/ARCInstrInfo.h index 2cf05ba57bd4b..ebeaf877f8436 100644 --- a/llvm/lib/Target/ARC/ARCInstrInfo.h +++ b/llvm/lib/Target/ARC/ARCInstrInfo.h @@ -70,14 +70,12 @@ class ARCInstrInfo : public ARCGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 22769dbf38719..6077c18463240 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -107,8 +107,9 @@ static const ARM_MLxEntry ARM_MLxTable[] = { { ARM::VMLSslfq, ARM::VMULslfq, ARM::VSUBfq, false, true }, }; -ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI) - : ARMGenInstrInfo(STI, ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), +ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget &STI, + const ARMBaseRegisterInfo 
&TRI) + : ARMGenInstrInfo(STI, TRI, ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP), Subtarget(STI) { for (unsigned i = 0, e = std::size(ARM_MLxTable); i != e; ++i) { if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second) @@ -928,15 +929,15 @@ ARMBaseInstrInfo::describeLoadedValue(const MachineInstr &MI, return TargetInstrInfo::describeLoadedValue(MI, Reg); } -const MachineInstrBuilder & -ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg, - unsigned SubIdx, unsigned State, - const TargetRegisterInfo *TRI) const { +const MachineInstrBuilder &ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, + unsigned Reg, + unsigned SubIdx, + unsigned State) const { if (!SubIdx) return MIB.addReg(Reg, State); if (Register::isPhysicalRegister(Reg)) - return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); + return MIB.addReg(getRegisterInfo().getSubReg(Reg, SubIdx), State); return MIB.addReg(Reg, State, SubIdx); } @@ -944,18 +945,18 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); Align Alignment = MFI.getObjectAlign(FI); + const ARMBaseRegisterInfo &TRI = getRegisterInfo(); MachineMemOperand *MMO = MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, MFI.getObjectSize(FI), Alignment); - switch (TRI->getSpillSize(*RC)) { + switch (TRI.getSpillSize(*RC)) { case 2: if (ARM::HPRRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DebugLoc(), get(ARM::VSTRH)) @@ -1010,8 +1011,8 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) { if (Subtarget.hasV5TEOps()) { MachineInstrBuilder MIB = BuildMI(MBB, I, DebugLoc(), get(ARM::STRD)); - AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI); - AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI); + AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill)); + AddDReg(MIB, SrcReg, ARM::gsub_1, 0); MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO) .add(predOps(ARMCC::AL)); } else { @@ -1021,8 +1022,8 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FI) .addMemOperand(MMO) .add(predOps(ARMCC::AL)); - AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI); - AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI); + AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill)); + AddDReg(MIB, SrcReg, ARM::gsub_1, 0); } } else llvm_unreachable("Unknown reg class!"); @@ -1072,9 +1073,9 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FI) .add(predOps(ARMCC::AL)) .addMemOperand(MMO); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); - AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill)); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0); + AddDReg(MIB, SrcReg, ARM::dsub_2, 0); } } else llvm_unreachable("Unknown reg class!"); @@ -1104,10 +1105,10 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FI) .add(predOps(ARMCC::AL)) .addMemOperand(MMO); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); - 
AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill)); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0); + AddDReg(MIB, SrcReg, ARM::dsub_3, 0); } } else llvm_unreachable("Unknown reg class!"); @@ -1124,14 +1125,14 @@ void ARMBaseInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FI) .add(predOps(ARMCC::AL)) .addMemOperand(MMO); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0, TRI); - MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0, TRI); - AddDReg(MIB, SrcReg, ARM::dsub_7, 0, TRI); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill)); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0); + MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0); + AddDReg(MIB, SrcReg, ARM::dsub_7, 0); } else llvm_unreachable("Unknown reg class!"); break; @@ -1207,10 +1208,12 @@ Register ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI, return false; } -void ARMBaseInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void ARMBaseInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); MachineFunction &MF = *MBB.getParent(); @@ -1220,7 +1223,8 @@ void ARMBaseInstrInfo::loadRegFromStackSlot( MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Alignment); - switch (TRI->getSpillSize(*RC)) { + const ARMBaseRegisterInfo &TRI = getRegisterInfo(); + switch (TRI.getSpillSize(*RC)) { case 2: if (ARM::HPRRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(ARM::VLDRH), DestReg) @@ -1271,8 +1275,8 @@ void ARMBaseInstrInfo::loadRegFromStackSlot( if (Subtarget.hasV5TEOps()) { MIB = BuildMI(MBB, I, DL, get(ARM::LDRD)); - AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI); - AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); + AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead); + AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead); MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO) .add(predOps(ARMCC::AL)); } else { @@ -1282,8 +1286,8 @@ void ARMBaseInstrInfo::loadRegFromStackSlot( .addFrameIndex(FI) .addMemOperand(MMO) .add(predOps(ARMCC::AL)); - MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead); } if (DestReg.isPhysical()) @@ -1329,9 +1333,9 @@ void ARMBaseInstrInfo::loadRegFromStackSlot( .addFrameIndex(FI) .addMemOperand(MMO) .add(predOps(ARMCC::AL)); - MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, 
ARM::dsub_1, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead); if (DestReg.isPhysical()) MIB.addReg(DestReg, RegState::ImplicitDefine); } @@ -1358,10 +1362,10 @@ void ARMBaseInstrInfo::loadRegFromStackSlot( .addFrameIndex(FI) .add(predOps(ARMCC::AL)) .addMemOperand(MMO); - MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead); if (DestReg.isPhysical()) MIB.addReg(DestReg, RegState::ImplicitDefine); } @@ -1379,14 +1383,14 @@ void ARMBaseInstrInfo::loadRegFromStackSlot( .addFrameIndex(FI) .add(predOps(ARMCC::AL)) .addMemOperand(MMO); - MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead, TRI); - MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead, TRI); + MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead); + MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead); if (DestReg.isPhysical()) MIB.addReg(DestReg, RegState::ImplicitDefine); } else @@ -1652,8 +1656,7 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) { void ARMBaseInstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const { + const MachineInstr &Orig) const { unsigned Opcode = Orig.getOpcode(); switch (Opcode) { default: { diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 2869e7f708046..04e2ab055cf1a 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -44,7 +44,8 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { protected: // Can be only subclassed. 
- explicit ARMBaseInstrInfo(const ARMSubtarget &STI); + explicit ARMBaseInstrInfo(const ARMSubtarget &STI, + const ARMBaseRegisterInfo &TRI); void expandLoadStackGuardBase(MachineBasicBlock::iterator MI, unsigned LoadImmOpc, unsigned LoadOpc) const; @@ -125,7 +126,11 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { // if there is not such an opcode. virtual unsigned getUnindexedOpcode(unsigned Opc) const = 0; - virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0; + const ARMBaseRegisterInfo &getRegisterInfo() const { + return static_cast<const ARMBaseRegisterInfo &>( + TargetInstrInfo::getRegisterInfo()); + } + const ARMSubtarget &getSubtarget() const { return Subtarget; } ScheduleHazardRecognizer * @@ -211,14 +216,13 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; @@ -227,16 +231,14 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const override; + const MachineInstr &Orig) const override; MachineInstr & duplicate(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MachineInstr &Orig) const override; const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg, - unsigned SubIdx, unsigned State, - const TargetRegisterInfo *TRI) const; + unsigned SubIdx, unsigned State) const; bool produceSameValue(const MachineInstr &MI0, const MachineInstr &MI1, const MachineRegisterInfo *MRI) const override; diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index ce1cdb35116cc..80921ce4fb4dd 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -708,7 +708,7 @@ ARMBaseRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const MCInstrDesc &MCID = TII.get(ADDriOpc); Register BaseReg = MRI.createVirtualRegister(&ARM::GPRRegClass); - MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this)); + MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0)); MachineInstrBuilder MIB = BuildMI(*MBB, Ins, DL, MCID, BaseReg) .addFrameIndex(FrameIdx).addImm(Offset); @@ -881,8 +881,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Register PredReg = (PIdx == -1) ? Register() : MI.getOperand(PIdx+1).getReg(); const MCInstrDesc &MCID = MI.getDesc(); - const TargetRegisterClass *RegClass = - TII.getRegClass(MCID, FIOperandNum, this); + const TargetRegisterClass *RegClass = TII.getRegClass(MCID, FIOperandNum); if (Offset == 0 && (FrameReg.isVirtual() || RegClass->contains(FrameReg))) // Must be addrmode4/6.
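The ARM hunks above establish the pattern this patch repeats for every target: the register info now lives in the TargetInstrInfo base class (passed into the generated *GenInstrInfo constructor and held as the stored TRI reference), so storeRegToStackSlot, loadRegFromStackSlot, reMaterialize, AddDReg, and getRegClass all drop their TargetRegisterInfo parameters. For an out-of-tree target the migration would look roughly like the sketch below; this is a minimal illustration, and the Foo* names (including Foo::STORE) are hypothetical rather than code from this patch.

// Sketch of the post-patch shape for a hypothetical FooInstrInfo.
// Passing RI to the generated base before RI is constructed is safe because
// the base only binds the reference; the in-tree AVR, Hexagon, and LoongArch
// changes in this patch rely on the same ordering.
class FooInstrInfo : public FooGenInstrInfo {
  const FooRegisterInfo RI;

public:
  explicit FooInstrInfo(const FooSubtarget &STI)
      : FooGenInstrInfo(STI, RI, Foo::ADJCALLSTACKDOWN, Foo::ADJCALLSTACKUP),
        RI(STI) {}

  // Mirrors ARMBaseInstrInfo::getRegisterInfo() above: recover the derived
  // register info by downcasting the reference stored in TargetInstrInfo.
  const FooRegisterInfo &getRegisterInfo() const {
    return static_cast<const FooRegisterInfo &>(
        TargetInstrInfo::getRegisterInfo());
  }

  void storeRegToStackSlot(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, Register SrcReg,
                           bool IsKill, int FrameIndex,
                           const TargetRegisterClass *RC, Register VReg,
                           MachineInstr::MIFlag Flags) const override {
    // No TRI parameter any more: use the reference held by the base class,
    // exactly as the ARC and M68k hunks in this patch do.
    assert(TRI.getSpillSize(*RC) == 4 && "only 4-byte spills in this sketch");
    BuildMI(MBB, I, MBB.findDebugLoc(I), get(Foo::STORE))
        .addReg(SrcReg, getKillRegState(IsKill))
        .addFrameIndex(FrameIndex)
        .addImm(0);
  }
};

Call sites simplify the same way, dropping the TRI argument (e.g. TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, Register())), as the Hexagon and LoongArch frame-lowering hunks below show.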
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 138981ad92a87..21a113572ce93 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -2342,7 +2342,6 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF, const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo()); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); unsigned Limit = (1 << 12) - 1; for (auto &MBB : MF) { for (auto &MI : MBB) { @@ -2364,7 +2363,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF, break; const MCInstrDesc &MCID = MI.getDesc(); - const TargetRegisterClass *RegClass = TII.getRegClass(MCID, i, TRI); + const TargetRegisterClass *RegClass = TII.getRegClass(MCID, i); if (RegClass && !RegClass->contains(ARM::SP)) HasNonSPFrameIndex = true; diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/llvm/lib/Target/ARM/ARMInstrInfo.cpp index c684de7252e5d..f37054736b730 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMInstrInfo.cpp @@ -25,7 +25,8 @@ #include "llvm/MC/MCInst.h" using namespace llvm; -ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI) {} +ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) + : ARMBaseInstrInfo(STI, RI) {} /// Return the noop instruction to use for a noop. MCInst ARMInstrInfo::getNop() const { diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.h b/llvm/lib/Target/ARM/ARMInstrInfo.h index 178d7a2c630e4..9feaf1440f2b2 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMInstrInfo.h @@ -35,7 +35,7 @@ class ARMInstrInfo : public ARMBaseInstrInfo { /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method).
/// - const ARMRegisterInfo &getRegisterInfo() const override { return RI; } + const ARMRegisterInfo &getRegisterInfo() const { return RI; } private: void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override; diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index cd4299b7a1a53..db37b769efcad 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -2424,7 +2424,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps( Ops.pop_back(); const MCInstrDesc &MCID = TII->get(NewOpc); - const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI); + const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0); MRI->constrainRegClass(FirstReg, TRC); MRI->constrainRegClass(SecondReg, TRC); @@ -3014,7 +3014,7 @@ static void AdjustBaseAndOffset(MachineInstr *MI, Register NewBaseReg, MachineFunction *MF = MI->getMF(); MachineRegisterInfo &MRI = MF->getRegInfo(); const MCInstrDesc &MCID = TII->get(MI->getOpcode()); - const TargetRegisterClass *TRC = TII->getRegClass(MCID, BaseOp, TRI); + const TargetRegisterClass *TRC = TII->getRegClass(MCID, BaseOp); MRI.constrainRegClass(NewBaseReg, TRC); int OldOffset = MI->getOperand(BaseOp + 1).getImm(); @@ -3071,10 +3071,10 @@ static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset, const MCInstrDesc &MCID = TII->get(NewOpcode); // Constrain the def register class - const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI); + const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0); MRI.constrainRegClass(NewReg, TRC); // And do the same for the base operand - TRC = TII->getRegClass(MCID, 2, TRI); + TRC = TII->getRegClass(MCID, 2); MRI.constrainRegClass(MI->getOperand(1).getReg(), TRC); unsigned AddrMode = (MCID.TSFlags & ARMII::AddrModeMask); diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index 4a0883cc662e7..34baa3108402c 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -377,6 +377,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool isRWPI() const; bool useMachineScheduler() const { return UseMISched; } + bool enableTerminalRule() const override { return true; } bool useMachinePipeliner() const { return UseMIPipeliner; } bool hasMinSize() const { return OptMinSize; } bool isThumb1Only() const { return isThumb() && !hasThumb2(); } diff --git a/llvm/lib/Target/ARM/MLxExpansionPass.cpp b/llvm/lib/Target/ARM/MLxExpansionPass.cpp index 8e1bf1d957400..eb237b4275cc9 100644 --- a/llvm/lib/Target/ARM/MLxExpansionPass.cpp +++ b/llvm/lib/Target/ARM/MLxExpansionPass.cpp @@ -283,7 +283,7 @@ MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI, const MCInstrDesc &MCID1 = TII->get(MulOpc); const MCInstrDesc &MCID2 = TII->get(AddSubOpc); - Register TmpReg = MRI->createVirtualRegister(TII->getRegClass(MCID1, 0, TRI)); + Register TmpReg = MRI->createVirtualRegister(TII->getRegClass(MCID1, 0)); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg) .addReg(Src1Reg, getKillRegState(Src1Kill)) diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index 4b8c2fd569ead..01f588f0cdc38 100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -24,7 +24,7 @@ using namespace llvm; Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI(STI) {} + : ARMBaseInstrInfo(STI, RI), RI(STI) {} /// Return 
the noop instruction to use for a noop. MCInst Thumb1InstrInfo::getNop() const { @@ -116,7 +116,6 @@ void Thumb1InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { assert((RC == &ARM::tGPRRegClass || @@ -142,10 +141,12 @@ void Thumb1InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } } -void Thumb1InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void Thumb1InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { assert((RC->hasSuperClassEq(&ARM::tGPRRegClass) || (DestReg.isPhysical() && isARMLowRegister(DestReg))) && "Unknown regclass!"); diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/llvm/lib/Target/ARM/Thumb1InstrInfo.h index 68b326c0ebef6..289a30a4ca1e4 100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.h @@ -35,7 +35,7 @@ class Thumb1InstrInfo : public ARMBaseInstrInfo { /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). /// - const ThumbRegisterInfo &getRegisterInfo() const override { return RI; } + const ThumbRegisterInfo &getRegisterInfo() const { return RI; } void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg, Register SrcReg, @@ -43,14 +43,13 @@ class Thumb1InstrInfo : public ARMBaseInstrInfo { bool RenamableSrc = false) const override; void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool canCopyGluedNodeDuringSchedule(SDNode *N) const override; diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index f5653d459eac8..efb92c9bcac18 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -46,7 +46,7 @@ PreferNoCSEL("prefer-no-csel", cl::Hidden, cl::init(false)); Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI) - : ARMBaseInstrInfo(STI), RI(STI) {} + : ARMBaseInstrInfo(STI, RI), RI(STI) {} /// Return the noop instruction to use for a noop. 
MCInst Thumb2InstrInfo::getNop() const { @@ -165,7 +165,6 @@ void Thumb2InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -197,20 +196,22 @@ void Thumb2InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8)); - AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI); - AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI); + AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill)); + AddDReg(MIB, SrcReg, ARM::gsub_1, 0); MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO).add(predOps(ARMCC::AL)); return; } - ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC, TRI, + ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC, Register()); } -void Thumb2InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void Thumb2InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); MachineMemOperand *MMO = MF.getMachineMemOperand( @@ -238,8 +239,8 @@ void Thumb2InstrInfo::loadRegFromStackSlot( } MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8)); - AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI); - AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI); + AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead); + AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead); MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO).add(predOps(ARMCC::AL)); if (DestReg.isPhysical()) @@ -247,8 +248,7 @@ void Thumb2InstrInfo::loadRegFromStackSlot( return; } - ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI, - Register()); + ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, Register()); } void Thumb2InstrInfo::expandLoadStackGuard( @@ -564,7 +564,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, bool isSub = false; MachineFunction &MF = *MI.getParent()->getParent(); - const TargetRegisterClass *RegClass = TII.getRegClass(Desc, FrameRegIdx, TRI); + const TargetRegisterClass *RegClass = TII.getRegClass(Desc, FrameRegIdx); // Memory operands in inline assembly always use AddrModeT2_i12. 
if (Opcode == ARM::INLINEASM || Opcode == ARM::INLINEASM_BR) diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h index 1b0bf2d499510..1e11cb37efc05 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -44,21 +44,20 @@ class Thumb2InstrInfo : public ARMBaseInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). /// - const ThumbRegisterInfo &getRegisterInfo() const override { return RI; } + const ThumbRegisterInfo &getRegisterInfo() const { return RI; } MachineInstr *optimizeSelect(MachineInstr &MI, SmallPtrSetImpl<MachineInstr *> &SeenMIs, diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp index ce9908597dcac..6c37ba1411dde 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp +++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp @@ -30,8 +30,8 @@ namespace llvm { AVRInstrInfo::AVRInstrInfo(const AVRSubtarget &STI) - : AVRGenInstrInfo(STI, AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), RI(), - STI(STI) {} + : AVRGenInstrInfo(STI, RI, AVR::ADJCALLSTACKDOWN, AVR::ADJCALLSTACKUP), + RI(), STI(STI) {} void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, @@ -126,8 +126,7 @@ Register AVRInstrInfo::isStoreToStackSlot(const MachineInstr &MI, void AVRInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>(); @@ -142,9 +141,9 @@ void AVRInstrInfo::storeRegToStackSlot( MFI.getObjectAlign(FrameIndex)); unsigned Opcode = 0; - if (TRI->isTypeLegalForClass(*RC, MVT::i8)) { + if (RI.isTypeLegalForClass(*RC, MVT::i8)) { Opcode = AVR::STDPtrQRr; - } else if (TRI->isTypeLegalForClass(*RC, MVT::i16)) { + } else if (RI.isTypeLegalForClass(*RC, MVT::i16)) { Opcode = AVR::STDWPtrQRr; } else { llvm_unreachable("Cannot store this register into a stack slot!"); } @@ -161,7 +160,6 @@ void AVRInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction &MF = *MBB.getParent(); @@ -173,9 +171,9 @@ void AVRInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MFI.getObjectAlign(FrameIndex)); unsigned Opcode = 0; - if (TRI->isTypeLegalForClass(*RC, MVT::i8)) { + if (TRI.isTypeLegalForClass(*RC, MVT::i8)) { Opcode = AVR::LDDRdPtrQ; - } else if
(TRI->isTypeLegalForClass(*RC, MVT::i16)) { + } else if (TRI.isTypeLegalForClass(*RC, MVT::i16)) { // Opcode = AVR::LDDWRdPtrQ; //: FIXME: remove this once PR13375 gets fixed Opcode = AVR::LDDWRdYQ; diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.h b/llvm/lib/Target/AVR/AVRInstrInfo.h index 759aea2010962..4db535a990451 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.h +++ b/llvm/lib/Target/AVR/AVRInstrInfo.h @@ -79,13 +79,11 @@ class AVRInstrInfo : public AVRGenInstrInfo { bool RenamableSrc = false) const override; void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/llvm/lib/Target/BPF/BPFInstrInfo.cpp index 409f8b4c253b8..095e2497eec17 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.cpp +++ b/llvm/lib/Target/BPF/BPFInstrInfo.cpp @@ -27,7 +27,7 @@ using namespace llvm; BPFInstrInfo::BPFInstrInfo(const BPFSubtarget &STI) - : BPFGenInstrInfo(STI, BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {} + : BPFGenInstrInfo(STI, RI, BPF::ADJCALLSTACKDOWN, BPF::ADJCALLSTACKUP) {} void BPFInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, @@ -127,7 +127,6 @@ void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool IsKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -148,10 +147,12 @@ void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, llvm_unreachable("Can't store this register to stack slot"); } -void BPFInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void BPFInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.h b/llvm/lib/Target/BPF/BPFInstrInfo.h index 911e880166d29..d3ef9bc164f4a 100644 --- a/llvm/lib/Target/BPF/BPFInstrInfo.h +++ b/llvm/lib/Target/BPF/BPFInstrInfo.h @@ -39,14 +39,13 @@ class BPFInstrInfo : public BPFGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - 
const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp index 619a797be6dc7..34a7de8d8ae96 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp @@ -25,7 +25,7 @@ using namespace llvm; #include "CSKYGenInstrInfo.inc" CSKYInstrInfo::CSKYInstrInfo(const CSKYSubtarget &STI) - : CSKYGenInstrInfo(STI, CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), + : CSKYGenInstrInfo(STI, RI, CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), STI(STI) { v2sf = STI.hasFPUv2SingleFloat(); v2df = STI.hasFPUv2DoubleFloat(); diff --git a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp index 6579d3405cf39..057d87bc3c6a9 100644 --- a/llvm/lib/Target/DirectX/DXILResourceAccess.cpp +++ b/llvm/lib/Target/DirectX/DXILResourceAccess.cpp @@ -10,6 +10,7 @@ #include "DirectX.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/DXILResource.h" +#include "llvm/Frontend/HLSL/HLSLResource.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" @@ -20,6 +21,7 @@ #include "llvm/IR/IntrinsicsDirectX.h" #include "llvm/IR/User.h" #include "llvm/InitializePasses.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Transforms/Utils/ValueMapper.h" #define DEBUG_TYPE "dxil-resource-access" @@ -44,16 +46,28 @@ static Value *calculateGEPOffset(GetElementPtrInst *GEP, Value *PrevOffset, APInt ConstantOffset(DL.getIndexTypeSizeInBits(GEP->getType()), 0); if (GEP->accumulateConstantOffset(DL, ConstantOffset)) { APInt Scaled = ConstantOffset.udiv(ScalarSize); - return ConstantInt::get(Type::getInt32Ty(GEP->getContext()), Scaled); + return ConstantInt::get(DL.getIndexType(GEP->getType()), Scaled); } - auto IndexIt = GEP->idx_begin(); - assert(cast<ConstantInt>(IndexIt)->getZExtValue() == 0 && - "GEP is not indexing through pointer"); - ++IndexIt; - Value *Offset = *IndexIt; - assert(++IndexIt == GEP->idx_end() && "Too many indices in GEP"); - return Offset; + unsigned NumIndices = GEP->getNumIndices(); + + // If we have a single index we're indexing into a top level array. This + // generally only happens with cbuffers. + if (NumIndices == 1) + return *GEP->idx_begin(); + + // If we have two indices, this should be a simple access through a pointer. + if (NumIndices == 2) { + auto IndexIt = GEP->idx_begin(); + assert(cast<ConstantInt>(IndexIt)->getZExtValue() == 0 && + "GEP is not indexing through pointer"); + ++IndexIt; + Value *Offset = *IndexIt; + assert(++IndexIt == GEP->idx_end() && "Too many indices in GEP"); + return Offset; + } + + llvm_unreachable("Unhandled GEP structure for resource access"); } static void createTypedBufferStore(IntrinsicInst *II, StoreInst *SI, @@ -171,6 +185,127 @@ static void createRawLoad(IntrinsicInst *II, LoadInst *LI, Value *Offset) { LI->replaceAllUsesWith(V); } +namespace { +/// Helper for building a `load.cbufferrow` intrinsic given a simple type.
+struct CBufferRowIntrin { + Intrinsic::ID IID; + Type *RetTy; + unsigned int EltSize; + unsigned int NumElts; + + CBufferRowIntrin(const DataLayout &DL, Type *Ty) { + assert(Ty == Ty->getScalarType() && "Expected scalar type"); + + switch (DL.getTypeSizeInBits(Ty)) { + case 16: + IID = Intrinsic::dx_resource_load_cbufferrow_8; + RetTy = StructType::get(Ty, Ty, Ty, Ty, Ty, Ty, Ty, Ty); + EltSize = 2; + NumElts = 8; + break; + case 32: + IID = Intrinsic::dx_resource_load_cbufferrow_4; + RetTy = StructType::get(Ty, Ty, Ty, Ty); + EltSize = 4; + NumElts = 4; + break; + case 64: + IID = Intrinsic::dx_resource_load_cbufferrow_2; + RetTy = StructType::get(Ty, Ty); + EltSize = 8; + NumElts = 2; + break; + default: + llvm_unreachable("Only 16, 32, and 64 bit types supported"); + } + } +}; +} // namespace + +static void createCBufferLoad(IntrinsicInst *II, LoadInst *LI, Value *Offset, + dxil::ResourceTypeInfo &RTI) { + const DataLayout &DL = LI->getDataLayout(); + + Type *Ty = LI->getType(); + assert(!isa<StructType>(Ty) && "Structs not handled yet"); + CBufferRowIntrin Intrin(DL, Ty->getScalarType()); + + StringRef Name = LI->getName(); + Value *Handle = II->getOperand(0); + + IRBuilder<> Builder(LI); + + ConstantInt *GlobalOffset = dyn_cast<ConstantInt>(II->getOperand(1)); + assert(GlobalOffset && "CBuffer getpointer index must be constant"); + + unsigned int FixedOffset = GlobalOffset->getZExtValue(); + // If we have a further constant offset we can just fold it in to the fixed + // offset. + if (auto *ConstOffset = dyn_cast_if_present<ConstantInt>(Offset)) { + FixedOffset += ConstOffset->getZExtValue(); + Offset = nullptr; + } + + Value *CurrentRow = ConstantInt::get( + Builder.getInt32Ty(), FixedOffset / hlsl::CBufferRowSizeInBytes); + unsigned int CurrentIndex = + (FixedOffset % hlsl::CBufferRowSizeInBytes) / Intrin.EltSize; + + assert(!(CurrentIndex && Offset) && + "Dynamic indexing into elements of cbuffer rows is not supported"); + // At this point if we have a non-constant offset it has to be an array + // offset, so we can assume that it's a multiple of the row size. + if (Offset) + CurrentRow = FixedOffset ? Builder.CreateAdd(CurrentRow, Offset) : Offset; + + auto *CBufLoad = Builder.CreateIntrinsic( + Intrin.RetTy, Intrin.IID, {Handle, CurrentRow}, nullptr, Name + ".load"); + auto *Elt = + Builder.CreateExtractValue(CBufLoad, {CurrentIndex++}, Name + ".extract"); + + // At this point we've loaded the first scalar of our result, but our original + // type may have been a vector. + unsigned int Remaining = + ((DL.getTypeSizeInBits(Ty) / 8) / Intrin.EltSize) - 1; + if (Remaining == 0) { + // We only have a single element, so we're done. + Value *Result = Elt; + + // However, if we loaded a <1 x T>, then we need to adjust the type. + if (auto *VT = dyn_cast<FixedVectorType>(Ty)) { + assert(VT->getNumElements() == 1 && "Can't have multiple elements here"); + Result = Builder.CreateInsertElement(PoisonValue::get(VT), Result, + Builder.getInt32(0), Name); + } + LI->replaceAllUsesWith(Result); + return; + } + + // Walk each element and extract it, wrapping to new rows as needed.
+ SmallVector<Value *> Extracts{Elt}; + while (Remaining--) { + CurrentIndex %= Intrin.NumElts; + + if (CurrentIndex == 0) { + CurrentRow = Builder.CreateAdd(CurrentRow, + ConstantInt::get(Builder.getInt32Ty(), 1)); + CBufLoad = Builder.CreateIntrinsic(Intrin.RetTy, Intrin.IID, + {Handle, CurrentRow}, nullptr, + Name + ".load"); + } + + Extracts.push_back(Builder.CreateExtractValue(CBufLoad, {CurrentIndex++}, + Name + ".extract")); + } + + // Finally, we build up the original loaded value. + Value *Result = PoisonValue::get(Ty); + for (int I = 0, E = Extracts.size(); I < E; ++I) + Result = Builder.CreateInsertElement( + Result, Extracts[I], Builder.getInt32(I), Name + formatv(".upto{}", I)); + LI->replaceAllUsesWith(Result); +} + static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset, dxil::ResourceTypeInfo &RTI) { switch (RTI.getResourceKind()) { @@ -179,6 +314,8 @@ static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset, case dxil::ResourceKind::RawBuffer: case dxil::ResourceKind::StructuredBuffer: return createRawLoad(II, LI, Offset); + case dxil::ResourceKind::CBuffer: + return createCBufferLoad(II, LI, Offset, RTI); case dxil::ResourceKind::Texture1D: case dxil::ResourceKind::Texture2D: case dxil::ResourceKind::Texture2DMS: @@ -190,9 +327,8 @@ static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset, case dxil::ResourceKind::TextureCubeArray: case dxil::ResourceKind::FeedbackTexture2D: case dxil::ResourceKind::FeedbackTexture2DArray: - case dxil::ResourceKind::CBuffer: case dxil::ResourceKind::TBuffer: - // TODO: handle these + reportFatalUsageError("Load not yet implemented for resource type"); return; case dxil::ResourceKind::Sampler: case dxil::ResourceKind::RTAccelerationStructure: diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index 26a8728e1f37c..48a9085820471 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -1169,8 +1169,8 @@ void DXILBitcodeWriter::writeModuleInfo() { // We need to hardcode a triple and datalayout that's compatible with the // historical DXIL triple and datalayout from DXC.
StringRef Triple = "dxil-ms-dx"; - StringRef DL = "e-m:e-p:32:32-i1:8-i8:8-i16:32-i32:32-i64:64-" - "f16:32-f32:32-f64:64-n8:16:32:64"; + StringRef DL = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-" + "f16:16-f32:32-f64:64-n8:16:32:64"; writeStringRecord(Stream, bitc::MODULE_CODE_TRIPLE, Triple, 0 /*TODO*/); writeStringRecord(Stream, bitc::MODULE_CODE_DATALAYOUT, DL, 0 /*TODO*/); diff --git a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp index bb2efa43d818c..401881d6d0f67 100644 --- a/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXInstrInfo.cpp @@ -19,6 +19,6 @@ using namespace llvm; DirectXInstrInfo::DirectXInstrInfo(const DirectXSubtarget &STI) - : DirectXGenInstrInfo(STI) {} + : DirectXGenInstrInfo(STI, RI) {} DirectXInstrInfo::~DirectXInstrInfo() {} diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 68f53124f9db8..557a0a3f27819 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -1796,7 +1796,7 @@ namespace { const MachineDominatorTree &MDT; const HexagonInstrInfo &HII; - const HexagonRegisterInfo &HRI; + [[maybe_unused]] const HexagonRegisterInfo &HRI; MachineRegisterInfo &MRI; BitTracker &BT; }; @@ -1886,7 +1886,7 @@ bool BitSimplification::matchHalf(unsigned SelfR, bool BitSimplification::validateReg(BitTracker::RegisterRef R, unsigned Opc, unsigned OpNum) { - auto *OpRC = HII.getRegClass(HII.get(Opc), OpNum, &HRI); + auto *OpRC = HII.getRegClass(HII.get(Opc), OpNum); auto *RRC = HBS::getFinalVRegClass(R, MRI); return OpRC->hasSubClassEq(RRC); } diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index dd343d9fbe79f..df612262def5e 100644 --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1405,7 +1405,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB, bool IsKill = !HRI.isEHReturnCalleeSaveReg(Reg); int FI = I.getFrameIdx(); const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); - HII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, &HRI, Register()); + HII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, Register()); if (IsKill) MBB.addLiveIn(Reg); } @@ -1470,7 +1470,7 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB, MCRegister Reg = I.getReg(); const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); int FI = I.getFrameIdx(); - HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI, Register()); + HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, Register()); } return true; @@ -1814,8 +1814,7 @@ bool HexagonFrameLowering::expandStoreVecPred(MachineBasicBlock &B, .addReg(SrcR, getKillRegState(IsKill)) .addReg(TmpR0, RegState::Kill); - auto *HRI = B.getParent()->getSubtarget().getRegisterInfo(); - HII.storeRegToStackSlot(B, It, TmpR1, true, FI, RC, HRI, Register()); + HII.storeRegToStackSlot(B, It, TmpR1, true, FI, RC, Register()); expandStoreVec(B, std::prev(It), MRI, HII, NewRegs); NewRegs.push_back(TmpR0); @@ -1844,9 +1843,7 @@ bool HexagonFrameLowering::expandLoadVecPred(MachineBasicBlock &B, BuildMI(B, It, DL, HII.get(Hexagon::A2_tfrsi), TmpR0) .addImm(0x01010101); - MachineFunction &MF = *B.getParent(); - auto *HRI = MF.getSubtarget().getRegisterInfo(); - HII.loadRegFromStackSlot(B, It, TmpR1, FI, RC, HRI, Register()); + HII.loadRegFromStackSlot(B, It, TmpR1, FI, RC, 
Register()); expandLoadVec(B, std::prev(It), MRI, HII, NewRegs); BuildMI(B, It, DL, HII.get(Hexagon::V6_vandvrt), DstR) @@ -2225,7 +2222,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF, if (!Bad) { // If the addressing mode is ok, check the register class. unsigned OpNum = Load ? 0 : 2; - auto *RC = HII.getRegClass(In.getDesc(), OpNum, &HRI); + auto *RC = HII.getRegClass(In.getDesc(), OpNum); RC = getCommonRC(SI.RC, RC); if (RC == nullptr) Bad = true; @@ -2395,7 +2392,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF, HexagonBlockRanges::RegisterRef SrcRR = { SrcOp.getReg(), SrcOp.getSubReg() }; - auto *RC = HII.getRegClass(SI.getDesc(), 2, &HRI); + auto *RC = HII.getRegClass(SI.getDesc(), 2); // The this-> is needed to unconfuse MSVC. Register FoundR = this->findPhysReg(MF, Range, IM, DM, RC); LLVM_DEBUG(dbgs() << "Replacement reg:" << printReg(FoundR, &HRI) diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 55bafdea234fd..7682af4543b7c 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -118,9 +118,9 @@ const int Hexagon_ADDI_OFFSET_MIN = -32768; void HexagonInstrInfo::anchor() {} HexagonInstrInfo::HexagonInstrInfo(const HexagonSubtarget &ST) - : HexagonGenInstrInfo(ST, Hexagon::ADJCALLSTACKDOWN, + : HexagonGenInstrInfo(ST, RegInfo, Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP), - Subtarget(ST) {} + RegInfo(ST.getHwMode()), Subtarget(ST) {} namespace llvm { namespace HexagonFUnits { @@ -964,7 +964,6 @@ void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL = MBB.findDebugLoc(I); @@ -1009,10 +1008,12 @@ void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } } -void HexagonInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void HexagonInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL = MBB.findDebugLoc(I); MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index 48adf82833f51..796b978a2c3f0 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -23,6 +23,8 @@ #include <cstdint> #include <vector> +#include "HexagonRegisterInfo.h" + #define GET_INSTRINFO_HEADER #include "HexagonGenInstrInfo.inc" @@ -36,6 +38,7 @@ class MachineOperand; class TargetRegisterInfo; class HexagonInstrInfo : public HexagonGenInstrInfo { + const HexagonRegisterInfo RegInfo; const HexagonSubtarget &Subtarget; enum BundleAttribute { @@ -47,6 +50,8 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { public: explicit HexagonInstrInfo(const HexagonSubtarget &ST); + const HexagonRegisterInfo &getRegisterInfo() const { return RegInfo; } + /// TargetInstrInfo overrides.
/// If the specified machine instruction is a direct @@ -183,8 +188,7 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { /// is true, the register operand is the last use and must be marked kill. void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; /// Load the specified register of the given register class from the specified @@ -193,7 +197,7 @@ class HexagonInstrInfo : public HexagonGenInstrInfo { void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; /// This function is called for all pseudo instructions diff --git a/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp b/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp index 7cbd81ff227e1..54969b2317ef4 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoadStoreWidening.cpp @@ -646,7 +646,7 @@ bool HexagonLoadStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG, MachineInstr *CombI; if (Acc != 0) { const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi); - const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI); + const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0); Register VReg = MF->getRegInfo().createVirtualRegister(RC); MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg).addImm(LowerAcc); NG.push_back(TfrI); @@ -677,7 +677,7 @@ bool HexagonLoadStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG, } else { // Create vreg = A2_tfrsi #Acc; mem[hw] = vreg const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi); - const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI); + const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0); Register VReg = MF->getRegInfo().createVirtualRegister(RC); MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg).addImm(int(Acc)); NG.push_back(TfrI); diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index a3c8a882c0616..66c8b0a67169d 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -76,8 +76,7 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, OptLevel(TM.getOptLevel()), CPUString(std::string(Hexagon_MC::selectHexagonCPU(CPU))), TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), - RegInfo(getHwMode()), TLInfo(TM, *this), - InstrItins(getInstrItineraryForCPU(CPUString)) { + TLInfo(TM, *this), InstrItins(getInstrItineraryForCPU(CPUString)) { Hexagon_MC::addArchSubtarget(this, FS); // Beware of the default constructor of InstrItineraryData: it will // reset all members to 0. diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index 995f66d0551b4..7dfede249c63c 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -100,7 +100,6 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo { // The following objects can use the TargetTriple, so they must be // declared after it. 
HexagonInstrInfo InstrInfo; - HexagonRegisterInfo RegInfo; HexagonTargetLowering TLInfo; HexagonSelectionDAGInfo TSInfo; HexagonFrameLowering FrameLowering; @@ -122,7 +121,7 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo { } const HexagonInstrInfo *getInstrInfo() const override { return &InstrInfo; } const HexagonRegisterInfo *getRegisterInfo() const override { - return &RegInfo; + return &InstrInfo.getRegisterInfo(); } const HexagonTargetLowering *getTargetLowering() const override { return &TLInfo; @@ -295,6 +294,8 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo { bool useBSBScheduling() const { return UseBSBScheduling; } bool enableMachineScheduler() const override; + bool enableTerminalRule() const override { return true; } + // Always use the TargetLowering default scheduler. // FIXME: This will use the vliw scheduler which is probably just hurting // compiler time and will be removed eventually anyway. diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index cb88d1ac4af9f..d39b79a86753a 100644 --- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -653,7 +653,7 @@ bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr &MI, const MCInstrDesc& MCID = PacketMI.getDesc(); // First operand is always the result. - const TargetRegisterClass *PacketRC = HII->getRegClass(MCID, 0, HRI); + const TargetRegisterClass *PacketRC = HII->getRegClass(MCID, 0); // Double regs can not feed into new value store: PRM section: 5.4.2.2. if (PacketRC == &Hexagon::DoubleRegsRegClass) return false; @@ -866,7 +866,7 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI, return false; const MCInstrDesc& MCID = PI.getDesc(); - const TargetRegisterClass *VecRC = HII->getRegClass(MCID, 0, HRI); + const TargetRegisterClass *VecRC = HII->getRegClass(MCID, 0); if (DisableVecDblNVStores && VecRC == &Hexagon::HvxWRRegClass) return false; diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp index 02ed1001cd0d3..14b7557e7f94a 100644 --- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp +++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp @@ -27,7 +27,8 @@ using namespace llvm; #include "LanaiGenInstrInfo.inc" LanaiInstrInfo::LanaiInstrInfo(const LanaiSubtarget &STI) - : LanaiGenInstrInfo(STI, Lanai::ADJCALLSTACKDOWN, Lanai::ADJCALLSTACKUP), + : LanaiGenInstrInfo(STI, RegisterInfo, Lanai::ADJCALLSTACKDOWN, + Lanai::ADJCALLSTACKUP), RegisterInfo() {} void LanaiInstrInfo::copyPhysReg(MachineBasicBlock &MBB, @@ -48,8 +49,7 @@ void LanaiInstrInfo::copyPhysReg(MachineBasicBlock &MBB, void LanaiInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator Position, Register SourceRegister, bool IsKill, int FrameIndex, - const TargetRegisterClass *RegisterClass, - const TargetRegisterInfo * /*RegisterInfo*/, Register /*VReg*/, + const TargetRegisterClass *RegisterClass, Register /*VReg*/, MachineInstr::MIFlag /*Flags*/) const { DebugLoc DL; if (Position != MBB.end()) { @@ -69,8 +69,7 @@ void LanaiInstrInfo::storeRegToStackSlot( void LanaiInstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator Position, Register DestinationRegister, int FrameIndex, - const TargetRegisterClass *RegisterClass, - const TargetRegisterInfo * /*RegisterInfo*/, Register /*VReg*/, + const TargetRegisterClass *RegisterClass, Register /*VReg*/, MachineInstr::MIFlag /*Flags*/) const { DebugLoc DL; 
if (Position != MBB.end()) { diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.h b/llvm/lib/Target/Lanai/LanaiInstrInfo.h index d98276243dc31..155e2f03be630 100644 --- a/llvm/lib/Target/Lanai/LanaiInstrInfo.h +++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.h @@ -58,15 +58,13 @@ class LanaiInstrInfo : public LanaiGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator Position, Register SourceRegister, bool IsKill, int FrameIndex, - const TargetRegisterClass *RegisterClass, - const TargetRegisterInfo *RegisterInfo, Register VReg, + const TargetRegisterClass *RegisterClass, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator Position, Register DestinationRegister, int FrameIndex, - const TargetRegisterClass *RegisterClass, - const TargetRegisterInfo *RegisterInfo, Register VReg, + const TargetRegisterClass *RegisterClass, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; diff --git a/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp b/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp index 0ccebeb393267..6358e348fe424 100644 --- a/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchDeadRegisterDefinitions.cpp @@ -60,7 +60,6 @@ bool LoongArchDeadRegisterDefinitions::runOnMachineFunction( return false; const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS(); LLVM_DEBUG(dbgs() << "***** LoongArchDeadRegisterDefinitions *****\n"); @@ -86,7 +85,7 @@ bool LoongArchDeadRegisterDefinitions::runOnMachineFunction( continue; LLVM_DEBUG(dbgs() << " Dead def operand #" << I << " in:\n "; MI.print(dbgs())); - const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI); + const TargetRegisterClass *RC = TII->getRegClass(Desc, I); if (!(RC && RC->contains(LoongArch::R0))) { LLVM_DEBUG(dbgs() << " Ignoring, register is not a GPR.\n"); continue; diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp index 1493bf4cba695..690b0639484d0 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp @@ -449,7 +449,7 @@ bool LoongArchFrameLowering::spillCalleeSavedRegisters( bool IsKill = !(Reg == LoongArch::R1 && MF->getFrameInfo().isReturnAddressTaken()); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, CS.getFrameIdx(), RC, TRI, + TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, CS.getFrameIdx(), RC, Register()); } diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index 9a35df2f240c4..9a33dccd002c7 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -26,9 +26,9 @@ using namespace llvm; #include "LoongArchGenInstrInfo.inc" LoongArchInstrInfo::LoongArchInstrInfo(const LoongArchSubtarget &STI) - : LoongArchGenInstrInfo(STI, LoongArch::ADJCALLSTACKDOWN, + : LoongArchGenInstrInfo(STI, RegInfo, LoongArch::ADJCALLSTACKDOWN, LoongArch::ADJCALLSTACKUP), - STI(STI) {} + RegInfo(STI.getHwMode()), STI(STI) {} MCInst LoongArchInstrInfo::getNop() const { return
MCInstBuilder(LoongArch::ANDI) @@ -113,14 +113,14 @@ void LoongArchInstrInfo::copyPhysReg(MachineBasicBlock &MBB, void LoongArchInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool IsKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, - MachineInstr::MIFlag Flags) const { + + Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); unsigned Opcode; if (LoongArch::GPRRegClass.hasSubClassEq(RC)) - Opcode = TRI->getRegSizeInBits(LoongArch::GPRRegClass) == 32 + Opcode = TRI.getRegSizeInBits(LoongArch::GPRRegClass) == 32 ? LoongArch::ST_W : LoongArch::ST_D; else if (LoongArch::FPR32RegClass.hasSubClassEq(RC)) @@ -149,8 +149,8 @@ void LoongArchInstrInfo::storeRegToStackSlot( void LoongArchInstrInfo::loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DstReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { + int FI, const TargetRegisterClass *RC, Register VReg, + MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); DebugLoc DL; @@ -159,7 +159,7 @@ void LoongArchInstrInfo::loadRegFromStackSlot( unsigned Opcode; if (LoongArch::GPRRegClass.hasSubClassEq(RC)) - Opcode = TRI->getRegSizeInBits(LoongArch::GPRRegClass) == 32 + Opcode = RegInfo.getRegSizeInBits(LoongArch::GPRRegClass) == 32 ? LoongArch::LD_W : LoongArch::LD_D; else if (LoongArch::FPR32RegClass.hasSubClassEq(RC)) @@ -378,9 +378,12 @@ bool LoongArchInstrInfo::isBranchOffsetInRange(unsigned BranchOp, } } -bool LoongArchInstrInfo::isSafeToMove(const MachineInstr &MI, - const MachineBasicBlock *MBB, - const MachineFunction &MF) const { +bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { + if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF)) + return true; + auto MII = MI.getIterator(); auto MIE = MBB->end(); @@ -426,25 +429,25 @@ bool LoongArchInstrInfo::isSafeToMove(const MachineInstr &MI, auto MO2 = Lu32I->getOperand(2).getTargetFlags(); if (MO0 == LoongArchII::MO_PCREL_HI && MO1 == LoongArchII::MO_PCREL_LO && MO2 == LoongArchII::MO_PCREL64_LO) - return false; + return true; if ((MO0 == LoongArchII::MO_GOT_PC_HI || MO0 == LoongArchII::MO_LD_PC_HI || MO0 == LoongArchII::MO_GD_PC_HI) && MO1 == LoongArchII::MO_GOT_PC_LO && MO2 == LoongArchII::MO_GOT_PC64_LO) - return false; + return true; if (MO0 == LoongArchII::MO_IE_PC_HI && MO1 == LoongArchII::MO_IE_PC_LO && MO2 == LoongArchII::MO_IE_PC64_LO) - return false; + return true; if (MO0 == LoongArchII::MO_DESC_PC_HI && MO1 == LoongArchII::MO_DESC_PC_LO && MO2 == LoongArchII::MO_DESC64_PC_LO) - return false; + return true; break; } case LoongArch::LU52I_D: { auto MO = MI.getOperand(2).getTargetFlags(); if (MO == LoongArchII::MO_PCREL64_HI || MO == LoongArchII::MO_GOT_PC64_HI || MO == LoongArchII::MO_IE_PC64_HI || MO == LoongArchII::MO_DESC64_PC_HI) - return false; + return true; break; } default: @@ -484,7 +487,7 @@ bool LoongArchInstrInfo::isSafeToMove(const MachineInstr &MI, auto MO1 = LoongArchII::getDirectFlags(SecondOp->getOperand(2)); auto MO2 = LoongArchII::getDirectFlags(Ld->getOperand(2)); if (MO1 == LoongArchII::MO_DESC_PC_LO && MO2 == LoongArchII::MO_DESC_LD) - return false; + return true; break; } if (SecondOp == MIE || @@ -493,34 +496,34 @@ bool 
LoongArchInstrInfo::isSafeToMove(const MachineInstr &MI, auto MO1 = LoongArchII::getDirectFlags(SecondOp->getOperand(2)); if (MO0 == LoongArchII::MO_PCREL_HI && SecondOp->getOpcode() == AddiOp && MO1 == LoongArchII::MO_PCREL_LO) - return false; + return true; if (MO0 == LoongArchII::MO_GOT_PC_HI && SecondOp->getOpcode() == LdOp && MO1 == LoongArchII::MO_GOT_PC_LO) - return false; + return true; if ((MO0 == LoongArchII::MO_LD_PC_HI || MO0 == LoongArchII::MO_GD_PC_HI) && SecondOp->getOpcode() == AddiOp && MO1 == LoongArchII::MO_GOT_PC_LO) - return false; + return true; break; } case LoongArch::ADDI_W: case LoongArch::ADDI_D: { auto MO = LoongArchII::getDirectFlags(MI.getOperand(2)); if (MO == LoongArchII::MO_PCREL_LO || MO == LoongArchII::MO_GOT_PC_LO) - return false; + return true; break; } case LoongArch::LD_W: case LoongArch::LD_D: { auto MO = LoongArchII::getDirectFlags(MI.getOperand(2)); if (MO == LoongArchII::MO_GOT_PC_LO) - return false; + return true; break; } case LoongArch::PseudoDESC_CALL: { auto MO = LoongArchII::getDirectFlags(MI.getOperand(2)); if (MO == LoongArchII::MO_DESC_CALL) - return false; + return true; break; } default: @@ -528,18 +531,6 @@ bool LoongArchInstrInfo::isSafeToMove(const MachineInstr &MI, } } - return true; -} - -bool LoongArchInstrInfo::isSchedulingBoundary(const MachineInstr &MI, - const MachineBasicBlock *MBB, - const MachineFunction &MF) const { - if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF)) - return true; - - if (!isSafeToMove(MI, MBB, MF)) - return true; - return false; } @@ -665,13 +656,13 @@ void LoongArchInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, if (FrameIndex == -1) report_fatal_error("The function size is incorrectly estimated."); storeRegToStackSlot(MBB, PCALAU12I, Scav, /*IsKill=*/true, FrameIndex, - &LoongArch::GPRRegClass, TRI, Register()); + &LoongArch::GPRRegClass, Register()); TRI->eliminateFrameIndex(std::prev(PCALAU12I.getIterator()), /*SpAdj=*/0, /*FIOperandNum=*/1); PCALAU12I.getOperand(1).setMBB(&RestoreBB); ADDI.getOperand(2).setMBB(&RestoreBB); loadRegFromStackSlot(RestoreBB, RestoreBB.end(), Scav, FrameIndex, - &LoongArch::GPRRegClass, TRI, Register()); + &LoongArch::GPRRegClass, Register()); TRI->eliminateFrameIndex(RestoreBB.back(), /*SpAdj=*/0, /*FIOperandNum=*/1); } diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h index e61314c034bdb..796ef9f3a5715 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h @@ -24,9 +24,13 @@ namespace llvm { class LoongArchSubtarget; class LoongArchInstrInfo : public LoongArchGenInstrInfo { + const LoongArchRegisterInfo RegInfo; + public: explicit LoongArchInstrInfo(const LoongArchSubtarget &STI); + const LoongArchRegisterInfo &getRegisterInfo() const { return RegInfo; } + MCInst getNop() const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -36,13 +40,11 @@ class LoongArchInstrInfo : public LoongArchGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DstReg, - int FrameIndex, const TargetRegisterClass *RC, - 
const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; // Materializes the given integer Val into DstReg. @@ -64,9 +66,6 @@ class LoongArchInstrInfo : public LoongArchGenInstrInfo { bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override; - bool isSafeToMove(const MachineInstr &MI, const MachineBasicBlock *MBB, - const MachineFunction &MF) const override; - bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp b/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp index 3acbe4992273a..76a8ba1c90e50 100644 --- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.cpp @@ -95,4 +95,4 @@ LoongArchSubtarget::LoongArchSubtarget(const Triple &TT, StringRef CPU, : LoongArchGenSubtargetInfo(TT, CPU, TuneCPU, FS), FrameLowering( initializeSubtargetDependencies(TT, CPU, TuneCPU, FS, ABIName)), - InstrInfo(*this), RegInfo(getHwMode()), TLInfo(TM, *this) {} + InstrInfo(*this), TLInfo(TM, *this) {} diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h index 5e12bafebb0d5..2beff07949daf 100644 --- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h +++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h @@ -45,7 +45,6 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo { LoongArchABI::ABI TargetABI = LoongArchABI::ABI_Unknown; LoongArchFrameLowering FrameLowering; LoongArchInstrInfo InstrInfo; - LoongArchRegisterInfo RegInfo; LoongArchTargetLowering TLInfo; SelectionDAGTargetInfo TSInfo; @@ -78,7 +77,7 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo { } const LoongArchInstrInfo *getInstrInfo() const override { return &InstrInfo; } const LoongArchRegisterInfo *getRegisterInfo() const override { - return &RegInfo; + return &InstrInfo.getRegisterInfo(); } const LoongArchTargetLowering *getTargetLowering() const override { return &TLInfo; diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.cpp b/llvm/lib/Target/M68k/M68kInstrInfo.cpp index c6be190bd1245..91077ff5961a4 100644 --- a/llvm/lib/Target/M68k/M68kInstrInfo.cpp +++ b/llvm/lib/Target/M68k/M68kInstrInfo.cpp @@ -43,7 +43,7 @@ using namespace llvm; void M68kInstrInfo::anchor() {} M68kInstrInfo::M68kInstrInfo(const M68kSubtarget &STI) - : M68kGenInstrInfo(STI, M68k::ADJCALLSTACKDOWN, M68k::ADJCALLSTACKUP, 0, + : M68kGenInstrInfo(STI, RI, M68k::ADJCALLSTACKDOWN, M68k::ADJCALLSTACKUP, 0, M68k::RET), Subtarget(STI), RI(STI) {} @@ -838,15 +838,14 @@ bool M68kInstrInfo::getStackSlotRange(const TargetRegisterClass *RC, void M68kInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { const MachineFrameInfo &MFI = MBB.getParent()->getFrameInfo(); - assert(MFI.getObjectSize(FrameIndex) >= TRI->getSpillSize(*RC) && + assert(MFI.getObjectSize(FrameIndex) >= TRI.getSpillSize(*RC) && "Stack slot is too small to store"); (void)MFI; - unsigned Opc = getStoreRegOpcode(SrcReg, RC, TRI, Subtarget); + unsigned Opc = getStoreRegOpcode(SrcReg, RC, &TRI, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); // (0,FrameIndex) <- 
$reg M68k::addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIndex) @@ -857,15 +856,14 @@ void M68kInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DstReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { const MachineFrameInfo &MFI = MBB.getParent()->getFrameInfo(); - assert(MFI.getObjectSize(FrameIndex) >= TRI->getSpillSize(*RC) && + assert(MFI.getObjectSize(FrameIndex) >= TRI.getSpillSize(*RC) && "Stack slot is too small to load"); (void)MFI; - unsigned Opc = getLoadRegOpcode(DstReg, RC, TRI, Subtarget); + unsigned Opc = getLoadRegOpcode(DstReg, RC, &TRI, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); M68k::addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DstReg), FrameIndex); } diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.h b/llvm/lib/Target/M68k/M68kInstrInfo.h index 97615d60caa0b..2b3789d768602 100644 --- a/llvm/lib/Target/M68k/M68kInstrInfo.h +++ b/llvm/lib/Target/M68k/M68kInstrInfo.h @@ -280,14 +280,12 @@ class M68kInstrInfo : public M68kGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool IsKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp index 65b4820752c94..0fb4e9d9fcb62 100644 --- a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -26,13 +26,13 @@ using namespace llvm; void MSP430InstrInfo::anchor() {} MSP430InstrInfo::MSP430InstrInfo(const MSP430Subtarget &STI) - : MSP430GenInstrInfo(STI, MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP), + : MSP430GenInstrInfo(STI, RI, MSP430::ADJCALLSTACKDOWN, + MSP430::ADJCALLSTACKUP), RI() {} void MSP430InstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIdx, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; if (MI != MBB.end()) DL = MI->getDebugLoc(); @@ -56,10 +56,12 @@ void MSP430InstrInfo::storeRegToStackSlot( llvm_unreachable("Cannot store this register to stack slot!"); } -void MSP430InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + Register DestReg, int FrameIdx, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (MI != MBB.end()) DL = MI->getDebugLoc(); MachineFunction &MF = *MBB.getParent(); diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.h 
b/llvm/lib/Target/MSP430/MSP430InstrInfo.h index 316c136890bf8..c0a398452ef6d 100644 --- a/llvm/lib/Target/MSP430/MSP430InstrInfo.h +++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.h @@ -42,13 +42,11 @@ class MSP430InstrInfo : public MSP430GenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIdx, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; unsigned getInstSizeInBytes(const MachineInstr &MI) const override; diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp index aa94f54cdf9a0..d23ec57d46e17 100644 --- a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp @@ -37,11 +37,7 @@ using namespace llvm; #define DEBUG_TYPE "mips16-instrinfo" Mips16InstrInfo::Mips16InstrInfo(const MipsSubtarget &STI) - : MipsInstrInfo(STI, Mips::Bimm16), RI(STI) {} - -const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const { - return RI; -} + : MipsInstrInfo(STI, RI, Mips::Bimm16), RI(STI) {} /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of @@ -105,7 +101,6 @@ void Mips16InstrInfo::storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -120,10 +115,12 @@ void Mips16InstrInfo::storeRegToStack(MachineBasicBlock &MBB, .addMemOperand(MMO); } -void Mips16InstrInfo::loadRegFromStack( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - int64_t Offset, MachineInstr::MIFlag Flags) const { +void Mips16InstrInfo::loadRegFromStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + int64_t Offset, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad); diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.h b/llvm/lib/Target/Mips/Mips16InstrInfo.h index 1058e8c25fb5b..4300d086f0614 100644 --- a/llvm/lib/Target/Mips/Mips16InstrInfo.h +++ b/llvm/lib/Target/Mips/Mips16InstrInfo.h @@ -30,7 +30,7 @@ class Mips16InstrInfo : public MipsInstrInfo { public: explicit Mips16InstrInfo(const MipsSubtarget &STI); - const MipsRegisterInfo &getRegisterInfo() const override; + const Mips16RegisterInfo &getRegisterInfo() const { return RI; } /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of @@ -56,13 +56,14 @@ class Mips16InstrInfo : public MipsInstrInfo { void storeRegToStack( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const 
TargetRegisterInfo *TRI, int64_t Offset, + int64_t Offset, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStack( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, + + int64_t Offset, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr &MI) const override; diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/llvm/lib/Target/Mips/MipsInstrInfo.cpp index bffdffa4af6a0..c879c46e49dd4 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.cpp +++ b/llvm/lib/Target/Mips/MipsInstrInfo.cpp @@ -39,8 +39,9 @@ using namespace llvm; // Pin the vtable to this file. void MipsInstrInfo::anchor() {} -MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBr) - : MipsGenInstrInfo(STI, Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), +MipsInstrInfo::MipsInstrInfo(const MipsSubtarget &STI, + const MipsRegisterInfo &RI, unsigned UncondBr) + : MipsGenInstrInfo(STI, RI, Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP), Subtarget(STI), UncondBrOpc(UncondBr) {} const MipsInstrInfo *MipsInstrInfo::create(MipsSubtarget &STI) { diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.h b/llvm/lib/Target/Mips/MipsInstrInfo.h index 2337ae7c079e7..0b90972977d5e 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.h +++ b/llvm/lib/Target/Mips/MipsInstrInfo.h @@ -55,7 +55,8 @@ class MipsInstrInfo : public MipsGenInstrInfo { BT_Indirect // One indirct branch. }; - explicit MipsInstrInfo(const MipsSubtarget &STI, unsigned UncondBrOpc); + explicit MipsInstrInfo(const MipsSubtarget &STI, const MipsRegisterInfo &RI, + unsigned UncondBrOpc); MCInst getNop() const override; @@ -130,7 +131,10 @@ class MipsInstrInfo : public MipsGenInstrInfo { /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). 
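The Mips hunks just below replace the pure-virtual getRegisterInfo() with a concrete accessor that downcasts the register info stored in the base class. A minimal standalone sketch of that ownership pattern, using illustrative mock class names rather than the real LLVM types:

#include <cassert>

struct TargetRegisterInfoMock {
  virtual ~TargetRegisterInfoMock() = default;
};
struct MipsRegisterInfoMock : TargetRegisterInfoMock {};

struct TargetInstrInfoMock {
  // The base class holds the one shared reference; subclasses no longer need
  // a virtual getRegisterInfo() or a TRI parameter threaded through calls.
  explicit TargetInstrInfoMock(const TargetRegisterInfoMock &RI) : RI(RI) {}
  const TargetRegisterInfoMock &getRegisterInfo() const { return RI; }

private:
  const TargetRegisterInfoMock &RI;
};

struct MipsInstrInfoMock : TargetInstrInfoMock {
  explicit MipsInstrInfoMock(const MipsRegisterInfoMock &RI)
      : TargetInstrInfoMock(RI) {}
  // Covariant-style accessor: the downcast is safe because the constructor
  // only ever accepts the derived register-info type.
  const MipsRegisterInfoMock &getRegisterInfo() const {
    return static_cast<const MipsRegisterInfoMock &>(
        TargetInstrInfoMock::getRegisterInfo());
  }
};

int main() {
  MipsRegisterInfoMock RI;
  MipsInstrInfoMock TII(RI);
  assert(&TII.getRegisterInfo() == &RI); // one instance, two typed views
}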
- virtual const MipsRegisterInfo &getRegisterInfo() const = 0; + const MipsRegisterInfo &getRegisterInfo() const { + return static_cast<const MipsRegisterInfo &>( + TargetInstrInfo::getRegisterInfo()); + } virtual unsigned getOppositeBranchOpc(unsigned Opc) const = 0; @@ -143,31 +147,28 @@ class MipsInstrInfo : public MipsGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override { - storeRegToStack(MBB, MBBI, SrcReg, isKill, FrameIndex, RC, TRI, 0, Flags); + storeRegToStack(MBB, MBBI, SrcReg, isKill, FrameIndex, RC, 0, Flags); } void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override { - loadRegFromStack(MBB, MBBI, DestReg, FrameIndex, RC, TRI, 0, Flags); + loadRegFromStack(MBB, MBBI, DestReg, FrameIndex, RC, 0, Flags); } virtual void storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - int64_t Offset, + const TargetRegisterClass *RC, int64_t Offset, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const = 0; virtual void loadRegFromStack( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, + int FrameIndex, const TargetRegisterClass *RC, int64_t Offset, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const = 0; virtual void adjustStackPtr(unsigned SP, int64_t Amount, diff --git a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp index f08704a7e799c..942194cf31d44 100644 --- a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -172,7 +172,7 @@ void ExpandPseudo::expandLoadCCond(MachineBasicBlock &MBB, Iter I) { Register VR = MRI.createVirtualRegister(RC); Register Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex(); - TII.loadRegFromStack(MBB, I, VR, FI, RC, &RegInfo, 0); + TII.loadRegFromStack(MBB, I, VR, FI, RC, 0); BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), Dst) .addReg(VR, RegState::Kill); } @@ -189,7 +189,7 @@ void ExpandPseudo::expandStoreCCond(MachineBasicBlock &MBB, Iter I) { BuildMI(MBB, I, I->getDebugLoc(), TII.get(TargetOpcode::COPY), VR) .addReg(Src, getKillRegState(I->getOperand(0).isKill())); - TII.storeRegToStack(MBB, I, VR, true, FI, RC, &RegInfo, 0); + TII.storeRegToStack(MBB, I, VR, true, FI, RC, 0); } void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I, @@ -210,9 +210,9 @@ void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I, DebugLoc DL = I->getDebugLoc(); const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY); - TII.loadRegFromStack(MBB, I, VR0, FI, RC, &RegInfo, 0); + TII.loadRegFromStack(MBB, I, VR0, FI, RC, 0); BuildMI(MBB, I, DL, Desc, Lo).addReg(VR0, RegState::Kill); - TII.loadRegFromStack(MBB, I, VR1, FI, RC, &RegInfo, RegSize); + TII.loadRegFromStack(MBB, I, VR1, FI, RC, RegSize); BuildMI(MBB, I, DL, Desc, Hi).addReg(VR1, RegState::Kill); } @@ -234,9 +234,9 @@
void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I, DebugLoc DL = I->getDebugLoc(); BuildMI(MBB, I, DL, TII.get(MFLoOpc), VR0).addReg(Src); - TII.storeRegToStack(MBB, I, VR0, true, FI, RC, &RegInfo, 0); + TII.storeRegToStack(MBB, I, VR0, true, FI, RC, 0); BuildMI(MBB, I, DL, TII.get(MFHiOpc), VR1).addReg(Src, SrcKill); - TII.storeRegToStack(MBB, I, VR1, true, FI, RC, &RegInfo, RegSize); + TII.storeRegToStack(MBB, I, VR1, true, FI, RC, RegSize); } bool ExpandPseudo::expandCopy(MachineBasicBlock &MBB, Iter I) { @@ -321,11 +321,9 @@ bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB, int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(MF, RC2); if (!Subtarget.isLittle()) std::swap(LoReg, HiReg); - TII.storeRegToStack(MBB, I, LoReg, I->getOperand(1).isKill(), FI, RC, - &RegInfo, 0); - TII.storeRegToStack(MBB, I, HiReg, I->getOperand(2).isKill(), FI, RC, - &RegInfo, 4); - TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &RegInfo, 0); + TII.storeRegToStack(MBB, I, LoReg, I->getOperand(1).isKill(), FI, RC, 0); + TII.storeRegToStack(MBB, I, HiReg, I->getOperand(2).isKill(), FI, RC, 4); + TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, 0); return true; } @@ -385,8 +383,8 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB, // We re-use the same spill slot each time so that the stack frame doesn't // grow too much in functions with a large number of moves. int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(MF, RC); - TII.storeRegToStack(MBB, I, SrcReg, Op1.isKill(), FI, RC, &RegInfo, 0); - TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &RegInfo, Offset); + TII.storeRegToStack(MBB, I, SrcReg, Op1.isKill(), FI, RC, 0); + TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, Offset); return true; } @@ -480,8 +478,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, if (!MBB.isLiveIn(ABI.GetEhDataReg(I))) MBB.addLiveIn(ABI.GetEhDataReg(I)); TII.storeRegToStackSlot(MBB, MBBI, ABI.GetEhDataReg(I), false, - MipsFI->getEhDataRegFI(I), RC, &RegInfo, - Register()); + MipsFI->getEhDataRegFI(I), RC, Register()); } // Emit .cfi_offset directives for eh data registers. @@ -579,8 +576,7 @@ void MipsSEFrameLowering::emitInterruptPrologueStub( .setMIFlag(MachineInstr::FrameSetup); STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false, - MipsFI->getISRRegFI(0), PtrRC, - STI.getRegisterInfo(), 0); + MipsFI->getISRRegFI(0), PtrRC, 0); // Fetch and Spill Status MBB.addLiveIn(Mips::COP012); @@ -590,8 +586,7 @@ void MipsSEFrameLowering::emitInterruptPrologueStub( .setMIFlag(MachineInstr::FrameSetup); STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false, - MipsFI->getISRRegFI(1), PtrRC, - STI.getRegisterInfo(), 0); + MipsFI->getISRRegFI(1), PtrRC, 0); // Build the configuration for disabling lower priority interrupts. Non EIC // interrupts need to be masked off with zero, EIC from the Cause register. @@ -657,7 +652,6 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, const MipsSEInstrInfo &TII = *static_cast<const MipsSEInstrInfo *>(STI.getInstrInfo()); - const MipsRegisterInfo &RegInfo = *STI.getRegisterInfo(); DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); MipsABIInfo ABI = STI.getABI(); @@ -690,8 +684,7 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, // Insert instructions that restore eh data registers.
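The expandBuildPairF64/expandExtractElementF64 paths above move an FPR64 value between 32-bit halves through a single reused spill slot. A host-side sketch of that store-lo/store-hi/load-64 sequence in plain C++; buildPairViaSpillSlot is an illustrative name and the final assert assumes a little-endian host:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <utility>

// Build a 64-bit "register" from two 32-bit halves through a reused stack
// slot, mirroring the storeRegToStack/loadRegFromStack calls above.
uint64_t buildPairViaSpillSlot(uint32_t Lo, uint32_t Hi, bool IsLittle) {
  unsigned char Slot[8]; // stands in for the per-function spill frame index
  if (!IsLittle)
    std::swap(Lo, Hi);           // big-endian keeps the high half at offset 0
  std::memcpy(Slot + 0, &Lo, 4); // storeRegToStack(..., /*Offset=*/0)
  std::memcpy(Slot + 4, &Hi, 4); // storeRegToStack(..., /*Offset=*/4)
  uint64_t Pair;
  std::memcpy(&Pair, Slot, 8);   // loadRegFromStack of the full double
  return Pair;
}

int main() {
  // On a little-endian host the low half lands in the low 32 bits.
  assert(buildPairViaSpillSlot(0x11223344u, 0xAABBCCDDu, /*IsLittle=*/true) ==
         0xAABBCCDD11223344ull);
}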
for (int J = 0; J < 4; ++J) { TII.loadRegFromStackSlot(MBB, I, ABI.GetEhDataReg(J), - MipsFI->getEhDataRegFI(J), RC, &RegInfo, - Register()); + MipsFI->getEhDataRegFI(J), RC, Register()); } } @@ -722,17 +715,15 @@ void MipsSEFrameLowering::emitInterruptEpilogueStub( BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::EHB)); // Restore EPC - STI.getInstrInfo()->loadRegFromStackSlot(MBB, MBBI, Mips::K1, - MipsFI->getISRRegFI(0), PtrRC, - STI.getRegisterInfo(), Register()); + STI.getInstrInfo()->loadRegFromStackSlot( + MBB, MBBI, Mips::K1, MipsFI->getISRRegFI(0), PtrRC, Register()); BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP014) .addReg(Mips::K1) .addImm(0); // Restore Status - STI.getInstrInfo()->loadRegFromStackSlot(MBB, MBBI, Mips::K1, - MipsFI->getISRRegFI(1), PtrRC, - STI.getRegisterInfo(), Register()); + STI.getInstrInfo()->loadRegFromStackSlot( + MBB, MBBI, Mips::K1, MipsFI->getISRRegFI(1), PtrRC, Register()); BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP012) .addReg(Mips::K1) .addImm(0); @@ -795,7 +786,7 @@ bool MipsSEFrameLowering::spillCalleeSavedRegisters( // Insert the spill to the stack frame. bool IsKill = !IsRAAndRetAddrIsTaken; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, I.getFrameIdx(), RC, TRI, + TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, I.getFrameIdx(), RC, Register()); } diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp index dbdbb179a583d..a1d0aa089c089 100644 --- a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -28,11 +28,7 @@ static unsigned getUnconditionalBranch(const MipsSubtarget &STI) { } MipsSEInstrInfo::MipsSEInstrInfo(const MipsSubtarget &STI) - : MipsInstrInfo(STI, getUnconditionalBranch(STI)), RI(STI) {} - -const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const { - return RI; -} + : MipsInstrInfo(STI, RI, getUnconditionalBranch(STI)), RI(STI) {} /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of @@ -213,7 +209,6 @@ void MipsSEInstrInfo::storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -239,16 +234,16 @@ void MipsSEInstrInfo::storeRegToStack(MachineBasicBlock &MBB, Opc = Mips::SDC1; else if (Mips::FGR64RegClass.hasSubClassEq(RC)) Opc = Mips::SDC164; - else if (TRI->isTypeLegalForClass(*RC, MVT::v16i8)) + else if (RI.isTypeLegalForClass(*RC, MVT::v16i8)) Opc = Mips::ST_B; - else if (TRI->isTypeLegalForClass(*RC, MVT::v8i16) || - TRI->isTypeLegalForClass(*RC, MVT::v8f16)) + else if (RI.isTypeLegalForClass(*RC, MVT::v8i16) || + RI.isTypeLegalForClass(*RC, MVT::v8f16)) Opc = Mips::ST_H; - else if (TRI->isTypeLegalForClass(*RC, MVT::v4i32) || - TRI->isTypeLegalForClass(*RC, MVT::v4f32)) + else if (RI.isTypeLegalForClass(*RC, MVT::v4i32) || + RI.isTypeLegalForClass(*RC, MVT::v4f32)) Opc = Mips::ST_W; - else if (TRI->isTypeLegalForClass(*RC, MVT::v2i64) || - TRI->isTypeLegalForClass(*RC, MVT::v2f64)) + else if (RI.isTypeLegalForClass(*RC, MVT::v2i64) || + RI.isTypeLegalForClass(*RC, MVT::v2f64)) Opc = Mips::ST_D; else if (Mips::LO32RegClass.hasSubClassEq(RC)) Opc = Mips::SW; @@ -285,10 +280,12 @@ void MipsSEInstrInfo::storeRegToStack(MachineBasicBlock &MBB, 
.addFrameIndex(FI).addImm(Offset).addMemOperand(MMO); } -void MipsSEInstrInfo::loadRegFromStack( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - int64_t Offset, MachineInstr::MIFlag Flags) const { +void MipsSEInstrInfo::loadRegFromStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + int64_t Offset, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad); @@ -317,16 +314,16 @@ void MipsSEInstrInfo::loadRegFromStack( Opc = Mips::LDC1; else if (Mips::FGR64RegClass.hasSubClassEq(RC)) Opc = Mips::LDC164; - else if (TRI->isTypeLegalForClass(*RC, MVT::v16i8)) + else if (RI.isTypeLegalForClass(*RC, MVT::v16i8)) Opc = Mips::LD_B; - else if (TRI->isTypeLegalForClass(*RC, MVT::v8i16) || - TRI->isTypeLegalForClass(*RC, MVT::v8f16)) + else if (RI.isTypeLegalForClass(*RC, MVT::v8i16) || + RI.isTypeLegalForClass(*RC, MVT::v8f16)) Opc = Mips::LD_H; - else if (TRI->isTypeLegalForClass(*RC, MVT::v4i32) || - TRI->isTypeLegalForClass(*RC, MVT::v4f32)) + else if (RI.isTypeLegalForClass(*RC, MVT::v4i32) || + RI.isTypeLegalForClass(*RC, MVT::v4f32)) Opc = Mips::LD_W; - else if (TRI->isTypeLegalForClass(*RC, MVT::v2i64) || - TRI->isTypeLegalForClass(*RC, MVT::v2f64)) + else if (RI.isTypeLegalForClass(*RC, MVT::v2i64) || + RI.isTypeLegalForClass(*RC, MVT::v2f64)) Opc = Mips::LD_D; else if (Mips::HI32RegClass.hasSubClassEq(RC)) Opc = Mips::LW; @@ -682,8 +679,8 @@ MipsSEInstrInfo::compareOpndSize(unsigned Opc, const MCInstrDesc &Desc = get(Opc); assert(Desc.NumOperands == 2 && "Unary instruction expected."); const MipsRegisterInfo *RI = &getRegisterInfo(); - unsigned DstRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 0, RI)); - unsigned SrcRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 1, RI)); + unsigned DstRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 0)); + unsigned SrcRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 1)); return std::make_pair(DstRegSize > SrcRegSize, DstRegSize < SrcRegSize); } diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.h b/llvm/lib/Target/Mips/MipsSEInstrInfo.h index 2b4f55d184b8b..5c48ccdc27f02 100644 --- a/llvm/lib/Target/Mips/MipsSEInstrInfo.h +++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.h @@ -24,7 +24,7 @@ class MipsSEInstrInfo : public MipsInstrInfo { public: explicit MipsSEInstrInfo(const MipsSubtarget &STI); - const MipsRegisterInfo &getRegisterInfo() const override; + const MipsSERegisterInfo &getRegisterInfo() const { return RI; } /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of @@ -50,13 +50,12 @@ class MipsSEInstrInfo : public MipsInstrInfo { void storeRegToStack( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, + int64_t Offset, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStack( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, int64_t Offset, + int FrameIndex, const TargetRegisterClass *RC, int64_t Offset, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool expandPostRAPseudo(MachineInstr 
&MI) const override; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 6840c7ae8faf4..db2d96f5ff532 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -26,7 +26,7 @@ using namespace llvm; void NVPTXInstrInfo::anchor() {} NVPTXInstrInfo::NVPTXInstrInfo(const NVPTXSubtarget &STI) - : NVPTXGenInstrInfo(STI), RegInfo() {} + : NVPTXGenInstrInfo(STI, RegInfo), RegInfo() {} void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 910bc9d281259..aae3e49f6c70b 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -2520,11 +2520,11 @@ bool PPCFrameLowering::spillCalleeSavedRegisters( // saved vector registers. if (Subtarget.needsSwapsForVSXMemOps() && !MF->getFunction().hasFnAttribute(Attribute::NoUnwind)) - TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn, - I.getFrameIdx(), RC, TRI); + TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(), + RC); else TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, I.getFrameIdx(), RC, - TRI, Register()); + Register()); } } } @@ -2690,10 +2690,9 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters( // saved vector registers. if (Subtarget.needsSwapsForVSXMemOps() && !MF->getFunction().hasFnAttribute(Attribute::NoUnwind)) - TII.loadRegFromStackSlotNoUpd(MBB, I, Reg, CSI[i].getFrameIdx(), RC, - TRI); + TII.loadRegFromStackSlotNoUpd(MBB, I, Reg, CSI[i].getFrameIdx(), RC); else - TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI, + TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, Register()); assert(I != MBB.begin() && diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 3014aa6bfe31e..366a7b6d0135a 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -89,7 +89,7 @@ static cl::opt EnableFMARegPressureReduction( void PPCInstrInfo::anchor() {} PPCInstrInfo::PPCInstrInfo(const PPCSubtarget &STI) - : PPCGenInstrInfo(STI, PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP, + : PPCGenInstrInfo(STI, RI, PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP, /* CatchRetOpcode */ -1, STI.isPPC64() ? 
PPC::BLR8 : PPC::BLR), Subtarget(STI), RI(STI.getTargetMachine()) {} @@ -2014,8 +2014,7 @@ void PPCInstrInfo::StoreRegToStackSlot( void PPCInstrInfo::storeRegToStackSlotNoUpd( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, - bool isKill, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { + bool isKill, int FrameIdx, const TargetRegisterClass *RC) const { MachineFunction &MF = *MBB.getParent(); SmallVector<MachineInstr *, 4> NewMIs; @@ -2034,8 +2033,7 @@ void PPCInstrInfo::storeRegToStackSlotNoUpd( void PPCInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIdx, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags) const { // We need to avoid a situation in which the value from a VRRC register is // spilled using an Altivec instruction and reloaded into a VSRC register @@ -2045,7 +2043,7 @@ void PPCInstrInfo::storeRegToStackSlot( // the register is defined using an Altivec instruction and is then used by a // VSX instruction. RC = updatedRC(RC); - storeRegToStackSlotNoUpd(MBB, MI, SrcReg, isKill, FrameIdx, RC, TRI); + storeRegToStackSlotNoUpd(MBB, MI, SrcReg, isKill, FrameIdx, RC); } void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL, @@ -2060,8 +2058,7 @@ void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL, void PPCInstrInfo::loadRegFromStackSlotNoUpd( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, - int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { + int FrameIdx, const TargetRegisterClass *RC) const { MachineFunction &MF = *MBB.getParent(); SmallVector<MachineInstr *, 4> NewMIs; DebugLoc DL; @@ -2080,10 +2077,12 @@ void PPCInstrInfo::loadRegFromStackSlotNoUpd( NewMIs.back()->addMemOperand(MF, MMO); } -void PPCInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + Register DestReg, int FrameIdx, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { // We need to avoid a situation in which the value from a VRRC register is // spilled using an Altivec instruction and reloaded into a VSRC register // using a VSX instruction. The issue with this is that the VSX @@ -2093,7 +2092,7 @@ void PPCInstrInfo::loadRegFromStackSlot( // VSX instruction.
RC = updatedRC(RC); - loadRegFromStackSlotNoUpd(MBB, MI, DestReg, FrameIdx, RC, TRI); + loadRegFromStackSlotNoUpd(MBB, MI, DestReg, FrameIdx, RC); } bool PPCInstrInfo:: diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index d67fc28935586..8b824bc219ab2 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -570,7 +570,8 @@ class PPCInstrInfo : public PPCGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; // Emits a register spill without updating the register class for vector @@ -579,13 +580,13 @@ class PPCInstrInfo : public PPCGenInstrInfo { void storeRegToStackSlotNoUpd(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; + const TargetRegisterClass *RC) const; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; // Emits a register reload without updating the register class for vector @@ -594,8 +595,7 @@ class PPCInstrInfo : public PPCGenInstrInfo { void loadRegFromStackSlotNoUpd(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const; + const TargetRegisterClass *RC) const; unsigned getStoreOpcodeForSpill(const TargetRegisterClass *RC) const; diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 85b40727ff296..b3a7c829958ec 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -2023,7 +2023,7 @@ Register PPCRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetRegisterClass *RC = getPointerRegClass(); Register BaseReg = MRI.createVirtualRegister(RC); - MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this)); + MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0)); BuildMI(*MBB, Ins, DL, MCID, BaseReg) .addFrameIndex(FrameIdx).addImm(Offset); @@ -2051,7 +2051,7 @@ void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); const MCInstrDesc &MCID = MI.getDesc(); MachineRegisterInfo &MRI = MF.getRegInfo(); - MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, FIOperandNum, this)); + MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, FIOperandNum)); } bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp index 26f434b528584..cedaa8679ff1b 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp @@ -79,6 +79,32 @@ static void generateInstSeqImpl(int64_t Val, const MCSubtargetInfo &STI, } } + if (STI.hasFeature(RISCV::FeatureStdExtP)) { + // Check if the immediate is packed i8 or i10 + int32_t Bit63To32 = Val >> 32; + int32_t 
Bit31To0 = Val; + int16_t Bit31To16 = Bit31To0 >> 16; + int16_t Bit15To0 = Bit31To0; + int8_t Bit15To8 = Bit15To0 >> 8; + int8_t Bit7To0 = Bit15To0; + if (Bit63To32 == Bit31To0) { + if (IsRV64 && isInt<10>(Bit63To32)) { + Res.emplace_back(RISCV::PLI_W, Bit63To32); + return; + } + if (Bit31To16 == Bit15To0) { + if (isInt<10>(Bit31To16)) { + Res.emplace_back(RISCV::PLI_H, Bit31To16); + return; + } + if (Bit15To8 == Bit7To0) { + Res.emplace_back(RISCV::PLI_B, Bit15To8); + return; + } + } + } + } + if (isInt<32>(Val)) { // Depending on the active bits in the immediate Value v, the following // instruction sequences are emitted: @@ -562,6 +588,9 @@ OpndKind Inst::getOpndKind() const { case RISCV::LUI: case RISCV::QC_LI: case RISCV::QC_E_LI: + case RISCV::PLI_B: + case RISCV::PLI_H: + case RISCV::PLI_W: return RISCVMatInt::Imm; case RISCV::ADD_UW: return RISCVMatInt::RegX0; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h index a82cd650f42fa..5df8edb2ee85a 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h @@ -21,7 +21,7 @@ namespace RISCVMatInt { enum OpndKind { RegImm, // ADDI/ADDIW/XORI/SLLI/SRLI/SLLI_UW/RORI/BSETI/BCLRI/TH_SRRI - Imm, // LUI/QC_LI/QC_E_LI + Imm, // LUI/QC_LI/QC_E_LI/PLI_B/PLI_H/PLI_W RegReg, // SH1ADD/SH2ADD/SH3ADD/PACK RegX0, // ADD_UW }; diff --git a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp index 51180f548ca6d..5d3d9b5c4cf03 100644 --- a/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp +++ b/llvm/lib/Target/RISCV/RISCVDeadRegisterDefinitions.cpp @@ -59,7 +59,6 @@ bool RISCVDeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { return false; const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); LiveIntervals &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS(); LLVM_DEBUG(dbgs() << "***** RISCVDeadRegisterDefinitions *****\n"); @@ -89,7 +88,7 @@ bool RISCVDeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << " Dead def operand #" << I << " in:\n "; MI.print(dbgs())); Register X0Reg; - const TargetRegisterClass *RC = TII->getRegClass(Desc, I, TRI); + const TargetRegisterClass *RC = TII->getRegClass(Desc, I); if (RC && RC->contains(RISCV::X0)) { X0Reg = RISCV::X0; } else if (RC && RC->contains(RISCV::X0_W)) { diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index f881c4c79d444..f7fc9528920a6 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -291,12 +291,12 @@ static void emitSiFiveCLICPreemptibleSaves(MachineFunction &MF, // which affects other passes. TII->storeRegToStackSlot(MBB, MBBI, RISCV::X8, /* IsKill=*/true, RVFI->getInterruptCSRFrameIndex(0), - &RISCV::GPRRegClass, STI.getRegisterInfo(), - Register(), MachineInstr::FrameSetup); + &RISCV::GPRRegClass, Register(), + MachineInstr::FrameSetup); TII->storeRegToStackSlot(MBB, MBBI, RISCV::X9, /* IsKill=*/true, RVFI->getInterruptCSRFrameIndex(1), - &RISCV::GPRRegClass, STI.getRegisterInfo(), - Register(), MachineInstr::FrameSetup); + &RISCV::GPRRegClass, Register(), + MachineInstr::FrameSetup); // Put `mcause` into X8 (s0), and `mepc` into X9 (s1).
If either of these are // used in the function, then they will appear in `getUnmanagedCSI` and will @@ -357,14 +357,12 @@ static void emitSiFiveCLICPreemptibleRestores(MachineFunction &MF, // X8 and X9 need to be restored to their values on function entry, which we // saved onto the stack in `emitSiFiveCLICPreemptibleSaves`. - TII->loadRegFromStackSlot(MBB, MBBI, RISCV::X9, - RVFI->getInterruptCSRFrameIndex(1), - &RISCV::GPRRegClass, STI.getRegisterInfo(), - Register(), MachineInstr::FrameSetup); - TII->loadRegFromStackSlot(MBB, MBBI, RISCV::X8, - RVFI->getInterruptCSRFrameIndex(0), - &RISCV::GPRRegClass, STI.getRegisterInfo(), - Register(), MachineInstr::FrameSetup); + TII->loadRegFromStackSlot( + MBB, MBBI, RISCV::X9, RVFI->getInterruptCSRFrameIndex(1), + &RISCV::GPRRegClass, Register(), MachineInstr::FrameSetup); + TII->loadRegFromStackSlot( + MBB, MBBI, RISCV::X8, RVFI->getInterruptCSRFrameIndex(0), + &RISCV::GPRRegClass, Register(), MachineInstr::FrameSetup); } // Get the ID of the libcall used for spilling and restoring callee saved @@ -2177,7 +2175,7 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters( MCRegister Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.storeRegToStackSlot(MBB, MI, Reg, !MBB.isLiveIn(Reg), - CS.getFrameIdx(), RC, TRI, Register(), + CS.getFrameIdx(), RC, Register(), MachineInstr::FrameSetup); } }; @@ -2267,8 +2265,8 @@ bool RISCVFrameLowering::restoreCalleeSavedRegisters( for (auto &CS : CSInfo) { MCRegister Reg = CS.getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI, - Register(), MachineInstr::FrameDestroy); + TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, Register(), + MachineInstr::FrameDestroy); assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); } diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 907833513c5d1..1cbedb7d141e2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -991,6 +991,18 @@ static unsigned getSegInstNF(unsigned Intrinsic) { } } +static bool isApplicableToPLI(int Val) { + // Check if the immediate is packed i8 or i10 + int16_t Bit31To16 = Val >> 16; + int16_t Bit15To0 = Val; + int8_t Bit15To8 = Bit15To0 >> 8; + int8_t Bit7To0 = Val; + if (Bit31To16 != Bit15To0) + return false; + + return isInt<10>(Bit31To16) || Bit15To8 == Bit7To0; +} + void RISCVDAGToDAGISel::Select(SDNode *Node) { // If we have a custom node, we have already selected. 
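isApplicableToPLI above decides whether a 32-bit immediate is one 16-bit (or 8-bit) lane pattern repeated across the word. A runnable host-side mirror of that check, plus the low-to-high replication the Select() path below performs; looksLikePLISplat and fitsSImm10 are illustrative names, not LLVM APIs:

#include <cassert>
#include <cstdint>

// Slice the 32-bit value with signed casts and compare the slices to detect
// a repeated i16 (or i8) lane pattern, as isApplicableToPLI does.
static bool fitsSImm10(int32_t V) { return V >= -512 && V <= 511; }

static bool looksLikePLISplat(int32_t Val) {
  int16_t Hi16 = static_cast<int16_t>(Val >> 16);
  int16_t Lo16 = static_cast<int16_t>(Val);
  if (Hi16 != Lo16)
    return false;                        // halves differ: no 16-bit-wide splat
  int8_t Hi8 = static_cast<int8_t>(Lo16 >> 8);
  int8_t Lo8 = static_cast<int8_t>(Lo16);
  return fitsSImm10(Hi16) || Hi8 == Lo8; // PLI_H range, or a PLI_B byte splat
}

int main() {
  assert(looksLikePLISplat(0x01230123));  // two 0x0123 halfwords -> PLI_H
  assert(looksLikePLISplat(0x7F7F7F7F));  // four 0x7F bytes      -> PLI_B
  assert(!looksLikePLISplat(0x00010002)); // halves differ
  // Select() then widens i32 to a replicated i64 so the value can
  // rematerialize as PLI_B/PLI_H:
  int32_t Imm = 0x01230123;
  uint64_t Rep = (static_cast<uint64_t>(static_cast<uint32_t>(Imm)) << 32) |
                 static_cast<uint32_t>(Imm);
  assert(Rep == 0x0123012301230123ull);
}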
if (Node->isMachineOpcode()) { @@ -1034,6 +1046,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (!isInt<32>(Imm) && isUInt<32>(Imm) && hasAllWUsers(Node)) Imm = SignExtend64<32>(Imm); + if (Subtarget->enablePExtCodeGen() && isApplicableToPLI(Imm) && + hasAllWUsers(Node)) { + // If it's 4 packed 8-bit integers or 2 packed signed 16-bit integers, we + // can simply copy lower 32 bits to higher 32 bits to make it able to + // rematerialize to PLI_B or PLI_H + Imm = ((uint64_t)Imm << 32) | (Imm & 0xFFFFFFFF); + } + ReplaceNode(Node, selectImm(CurDAG, DL, VT, Imm, *Subtarget).getNode()); return; } @@ -2654,6 +2674,21 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { CurDAG->RemoveDeadNode(Node); return; } + if (Subtarget->enablePExtCodeGen()) { + bool Is32BitCast = + (VT == MVT::i32 && (SrcVT == MVT::v4i8 || SrcVT == MVT::v2i16)) || + (SrcVT == MVT::i32 && (VT == MVT::v4i8 || VT == MVT::v2i16)); + bool Is64BitCast = + (VT == MVT::i64 && (SrcVT == MVT::v8i8 || SrcVT == MVT::v4i16 || + SrcVT == MVT::v2i32)) || + (SrcVT == MVT::i64 && + (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)); + if (Is32BitCast || Is64BitCast) { + ReplaceUses(SDValue(Node, 0), Node->getOperand(0)); + CurDAG->RemoveDeadNode(Node); + return; + } + } break; } case ISD::INSERT_SUBVECTOR: diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index a3ccbd8d4a8aa..637f1943b8511 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -284,6 +284,18 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, addRegisterClass(MVT::riscv_nxv32i8x2, &RISCV::VRN2M4RegClass); } + // fixed vector is stored in GPRs for P extension packed operations + if (Subtarget.enablePExtCodeGen()) { + if (Subtarget.is64Bit()) { + addRegisterClass(MVT::v2i32, &RISCV::GPRRegClass); + addRegisterClass(MVT::v4i16, &RISCV::GPRRegClass); + addRegisterClass(MVT::v8i8, &RISCV::GPRRegClass); + } else { + addRegisterClass(MVT::v2i16, &RISCV::GPRRegClass); + addRegisterClass(MVT::v4i8, &RISCV::GPRRegClass); + } + } + // Compute derived properties from the register classes. 
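The addRegisterClass calls above map small fixed vectors straight onto GPRs, which is why the BITCAST between v4i8/v2i16 and i32 marked Custom below can be a no-op. A host-side illustration, in plain C++, of that reinterpretation being lossless:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint8_t Lanes[4] = {0x11, 0x22, 0x33, 0x44}; // a v4i8 value
  uint32_t Gpr;
  std::memcpy(&Gpr, Lanes, 4); // BITCAST v4i8 -> i32: no target instruction
  uint8_t Back[4];
  std::memcpy(Back, &Gpr, 4);  // BITCAST i32 -> v4i8: round-trips exactly
  for (int I = 0; I < 4; ++I)
    assert(Back[I] == Lanes[I]);
}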
computeRegisterProperties(STI.getRegisterInfo()); @@ -492,6 +504,34 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::FTRUNC, ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN, ISD::FCANONICALIZE}; + if (Subtarget.enablePExtCodeGen()) { + setTargetDAGCombine(ISD::TRUNCATE); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand); + SmallVector<MVT> VTs; + if (Subtarget.is64Bit()) { + VTs.append({MVT::v2i32, MVT::v4i16, MVT::v8i8}); + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand); + setOperationAction(ISD::LOAD, MVT::v2i16, Custom); + setOperationAction(ISD::LOAD, MVT::v4i8, Custom); + } else { + VTs.append({MVT::v2i16, MVT::v4i8}); + } + setOperationAction(ISD::UADDSAT, VTs, Legal); + setOperationAction(ISD::SADDSAT, VTs, Legal); + setOperationAction(ISD::USUBSAT, VTs, Legal); + setOperationAction(ISD::SSUBSAT, VTs, Legal); + setOperationAction({ISD::AVGFLOORS, ISD::AVGFLOORU}, VTs, Legal); + setOperationAction({ISD::ABDS, ISD::ABDU}, VTs, Legal); + setOperationAction(ISD::BUILD_VECTOR, VTs, Custom); + setOperationAction(ISD::BITCAST, VTs, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VTs, Custom); + } + if (Subtarget.hasStdExtZfbfmin()) { setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::ConstantFP, MVT::bf16, Expand); @@ -1776,6 +1816,15 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, MaxLoadsPerMemcmp = Subtarget.getMaxLoadsPerMemcmp(/*OptSize=*/false); } +TargetLoweringBase::LegalizeTypeAction +RISCVTargetLowering::getPreferredVectorAction(MVT VT) const { + if (Subtarget.is64Bit() && Subtarget.enablePExtCodeGen()) + if (VT == MVT::v2i16 || VT == MVT::v4i8) + return TypeWidenVector; + + return TargetLoweringBase::getPreferredVectorAction(VT); +} + EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const { @@ -4391,6 +4440,37 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, MVT XLenVT = Subtarget.getXLenVT(); SDLoc DL(Op); + // Handle P extension packed vector BUILD_VECTOR with PLI for splat constants + if (Subtarget.enablePExtCodeGen()) { + bool IsPExtVector = + (VT == MVT::v2i16 || VT == MVT::v4i8) || + (Subtarget.is64Bit() && + (VT == MVT::v4i16 || VT == MVT::v8i8 || VT == MVT::v2i32)); + if (IsPExtVector) { + if (SDValue SplatValue = cast<BuildVectorSDNode>(Op)->getSplatValue()) { + if (auto *C = dyn_cast<ConstantSDNode>(SplatValue)) { + int64_t SplatImm = C->getSExtValue(); + bool IsValidImm = false; + + // Check immediate range based on vector type + if (VT == MVT::v8i8 || VT == MVT::v4i8) { + // PLI_B uses an 8-bit signed or unsigned immediate + IsValidImm = isUInt<8>(SplatImm) || isInt<8>(SplatImm); + if (isUInt<8>(SplatImm)) + SplatImm = (int8_t)SplatImm; + } else { + // PLI_H and PLI_W use 10-bit signed immediate + IsValidImm = isInt<10>(SplatImm); + } + + if (IsValidImm) { + SDValue Imm = DAG.getSignedTargetConstant(SplatImm, DL, XLenVT); + return DAG.getNode(RISCVISD::PLI, DL, VT, Imm); + } + } + } + } + } // Proper support for f16 requires Zvfh. bf16 always requires special
We need to cast the scalar to integer and create an integer @@ -7546,6 +7626,19 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi); } + if (Subtarget.enablePExtCodeGen()) { + bool Is32BitCast = + (VT == MVT::i32 && (Op0VT == MVT::v4i8 || Op0VT == MVT::v2i16)) || + (Op0VT == MVT::i32 && (VT == MVT::v4i8 || VT == MVT::v2i16)); + bool Is64BitCast = + (VT == MVT::i64 && (Op0VT == MVT::v8i8 || Op0VT == MVT::v4i16 || + Op0VT == MVT::v2i32)) || + (Op0VT == MVT::i64 && + (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)); + if (Is32BitCast || Is64BitCast) + return Op; + } + // Consider other scalar<->scalar casts as legal if the types are legal. // Otherwise expand them. if (!VT.isVector() && !Op0VT.isVector()) { @@ -8218,6 +8311,17 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, auto *Store = cast(Op); SDValue StoredVal = Store->getValue(); EVT VT = StoredVal.getValueType(); + if (Subtarget.enablePExtCodeGen()) { + if (VT == MVT::v2i16 || VT == MVT::v4i8) { + SDValue DL(Op); + SDValue Cast = DAG.getBitcast(MVT::i32, StoredVal); + SDValue NewStore = + DAG.getStore(Store->getChain(), DL, Cast, Store->getBasePtr(), + Store->getPointerInfo(), Store->getBaseAlign(), + Store->getMemOperand()->getFlags()); + return NewStore; + } + } if (VT == MVT::f64) { assert(Subtarget.hasStdExtZdinx() && !Subtarget.hasStdExtZilsd() && !Subtarget.is64Bit() && "Unexpected custom legalisation"); @@ -10500,6 +10604,17 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, return DAG.getNode(RISCVISD::FMV_H_X, DL, EltVT, IntExtract); } + if (Subtarget.enablePExtCodeGen() && VecVT.isFixedLengthVector()) { + if (VecVT != MVT::v4i16 && VecVT != MVT::v2i16 && VecVT != MVT::v8i8 && + VecVT != MVT::v4i8 && VecVT != MVT::v2i32) + return SDValue(); + SDValue Extracted = DAG.getBitcast(XLenVT, Vec); + unsigned ElemWidth = EltVT.getSizeInBits(); + SDValue Shamt = DAG.getNode(ISD::MUL, DL, XLenVT, Idx, + DAG.getConstant(ElemWidth, DL, XLenVT)); + return DAG.getNode(ISD::SRL, DL, XLenVT, Extracted, Shamt); + } + // If this is a fixed vector, we need to convert it to a scalable vector. 
MVT ContainerVT = VecVT; if (VecVT.isFixedLengthVector()) { @@ -14642,6 +14757,21 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, return; } + if (Subtarget.is64Bit() && Subtarget.enablePExtCodeGen()) { + SDLoc DL(N); + SDValue ExtLoad = + DAG.getExtLoad(ISD::SEXTLOAD, DL, MVT::i64, Ld->getChain(), + Ld->getBasePtr(), MVT::i32, Ld->getMemOperand()); + if (N->getValueType(0) == MVT::v2i16) { + Results.push_back(DAG.getBitcast(MVT::v4i16, ExtLoad)); + Results.push_back(ExtLoad.getValue(1)); + } else if (N->getValueType(0) == MVT::v4i8) { + Results.push_back(DAG.getBitcast(MVT::v8i8, ExtLoad)); + Results.push_back(ExtLoad.getValue(1)); + } + return; + } + assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() && "Unexpected custom legalisation"); @@ -14997,6 +15127,21 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, NewRes)); break; } + case RISCVISD::PASUB: + case RISCVISD::PASUBU: { + MVT VT = N->getSimpleValueType(0); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + assert(VT == MVT::v2i16 || VT == MVT::v4i8); + MVT NewVT = MVT::v4i16; + if (VT == MVT::v4i8) + NewVT = MVT::v8i8; + SDValue Undef = DAG.getUNDEF(VT); + Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, NewVT, {Op0, Undef}); + Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, NewVT, {Op1, Undef}); + Results.push_back(DAG.getNode(N->getOpcode(), DL, NewVT, {Op0, Op1})); + return; + } case ISD::EXTRACT_VECTOR_ELT: { // Custom-legalize an EXTRACT_VECTOR_ELT where XLEN < SEW, as the SEW // element type is illegal (currently only vXi64 on RV32). +// Combine (truncate (srl (sub (sext/zext a), (sext/zext b)), 1)) -> PASUB/PASUBU +static SDValue combinePExtTruncate(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + if (N0.getOpcode() != ISD::SRL) + return SDValue(); + + MVT VecVT = VT.getSimpleVT(); + if (VecVT != MVT::v4i16 && VecVT != MVT::v2i16 && VecVT != MVT::v8i8 && + VecVT != MVT::v4i8 && VecVT != MVT::v2i32) + return SDValue(); + + // Check if shift amount is 1 + SDValue ShAmt = N0.getOperand(1); + if (ShAmt.getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + + BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(ShAmt.getNode()); + if (!BV) + return SDValue(); + SDValue Splat = BV->getSplatValue(); + if (!Splat) + return SDValue(); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat); + if (!C) + return SDValue(); + if (C->getZExtValue() != 1) + return SDValue(); + + // Check for SUB operation + SDValue Sub = N0.getOperand(0); + if (Sub.getOpcode() != ISD::SUB) + return SDValue(); + + SDValue LHS = Sub.getOperand(0); + SDValue RHS = Sub.getOperand(1); + + // Check if both operands are sign/zero extends from the target + // type + bool IsSignExt = LHS.getOpcode() == ISD::SIGN_EXTEND && + RHS.getOpcode() == ISD::SIGN_EXTEND; + bool IsZeroExt = LHS.getOpcode() == ISD::ZERO_EXTEND && + RHS.getOpcode() == ISD::ZERO_EXTEND; + + if (!IsSignExt && !IsZeroExt) + return SDValue(); + + SDValue A = LHS.getOperand(0); + SDValue B = RHS.getOperand(0); + + // Check if the extends are from our target vector type + if (A.getValueType() != VT || B.getValueType() != VT) + return SDValue(); + + // Determine the instruction based on type and signedness + unsigned Opc; + if (IsSignExt) + Opc = RISCVISD::PASUB; + else if (IsZeroExt) + Opc = RISCVISD::PASUBU; + else + return SDValue(); + + // Create the PASUB/PASUBU node directly + return DAG.getNode(Opc, SDLoc(N), VT, {A, B}); +} + static SDValue performTRUNCATECombine(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + if
(VT.isFixedLengthVector() && Subtarget.enablePExtCodeGen()) + return combinePExtTruncate(N, DAG, Subtarget); + // Pre-promote (i1 (truncate (srl X, Y))) on RV64 with Zbs without zero // extending X. This is safe since we only need the LSB after the shift and // shift amounts larger than 31 would produce poison. If we wait until @@ -22203,8 +22421,7 @@ static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI, MachineFunction &MF = *BB->getParent(); DebugLoc DL = MI.getDebugLoc(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + const RISCVInstrInfo &TII = *MF.getSubtarget<RISCVSubtarget>().getInstrInfo(); Register LoReg = MI.getOperand(0).getReg(); Register HiReg = MI.getOperand(1).getReg(); Register SrcReg = MI.getOperand(2).getReg(); @@ -22213,7 +22430,7 @@ int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex(MF); TII.storeRegToStackSlot(*BB, MI, SrcReg, MI.getOperand(2).isKill(), FI, SrcRC, - RI, Register()); + Register()); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI); MachineMemOperand *MMOLo = MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 4, Align(8)); @@ -22239,8 +22456,7 @@ static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI, MachineFunction &MF = *BB->getParent(); DebugLoc DL = MI.getDebugLoc(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + const RISCVInstrInfo &TII = *MF.getSubtarget<RISCVSubtarget>().getInstrInfo(); Register DstReg = MI.getOperand(0).getReg(); Register LoReg = MI.getOperand(1).getReg(); Register HiReg = MI.getOperand(2).getReg(); @@ -22263,7 +22479,7 @@ .addFrameIndex(FI) .addImm(4) .addMemOperand(MMOHi); - TII.loadRegFromStackSlot(*BB, MI, DstReg, FI, DstRC, RI, Register()); + TII.loadRegFromStackSlot(*BB, MI, DstReg, FI, DstRC, Register()); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index dd62a9cf6c9e2..5cc427c867cfd 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -71,6 +71,9 @@ class RISCVTargetLowering : public TargetLowering { bool preferScalarizeSplat(SDNode *N) const override; + /// Customize the preferred legalization strategy for certain types.
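combinePExtTruncate, shown above, folds truncate((ext(a) - ext(b)) >> 1) into PASUB/PASUBU. A scalar model of one lane's semantics under that reading; pasubLane and pasubuLane are illustrative names:

#include <cassert>
#include <cstdint>

// Widen, subtract, halve, truncate: the subtraction cannot overflow in the
// wider type, and the shift before truncation is what PASUB encodes.
static int16_t pasubLane(int16_t A, int16_t B) {
  int32_t Wide = static_cast<int32_t>(A) - static_cast<int32_t>(B);
  return static_cast<int16_t>(Wide >> 1); // arithmetic shift (sign-extended)
}

static uint16_t pasubuLane(uint16_t A, uint16_t B) {
  uint32_t Wide = static_cast<uint32_t>(A) - static_cast<uint32_t>(B);
  return static_cast<uint16_t>(Wide >> 1); // logical shift (zero-extended)
}

int main() {
  assert(pasubLane(100, 301) == -101);        // (100 - 301) >> 1
  assert(pasubLane(-32768, 32767) == -32768); // widening avoids overflow
  assert(pasubuLane(10, 4) == 3);             // (10 - 4) >> 1
}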
+ LegalizeTypeAction getPreferredVectorAction(MVT VT) const override; + bool softPromoteHalfType() const override { return true; } /// Return the register type for a given MVT, ensuring vectors are treated diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index b05956b674d18..9d5421241bf0d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -82,8 +82,9 @@ namespace llvm::RISCV { } // end namespace llvm::RISCV RISCVInstrInfo::RISCVInstrInfo(const RISCVSubtarget &STI) - : RISCVGenInstrInfo(STI, RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP), - STI(STI) {} + : RISCVGenInstrInfo(STI, RegInfo, RISCV::ADJCALLSTACKDOWN, + RISCV::ADJCALLSTACKUP), + RegInfo(STI.getHwMode()), STI(STI) {} #define GET_INSTRINFO_HELPERS #include "RISCVGenInstrInfo.inc" @@ -638,7 +639,6 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool IsKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); @@ -646,8 +646,8 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, unsigned Opcode; if (RISCV::GPRRegClass.hasSubClassEq(RC)) { - Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ? - RISCV::SW : RISCV::SD; + Opcode = RegInfo.getRegSizeInBits(RISCV::GPRRegClass) == 32 ? RISCV::SW + : RISCV::SD; } else if (RISCV::GPRF16RegClass.hasSubClassEq(RC)) { Opcode = RISCV::SH_INX; } else if (RISCV::GPRF32RegClass.hasSubClassEq(RC)) { @@ -704,7 +704,7 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FI) .addMemOperand(MMO) .setMIFlag(Flags); - NumVRegSpilled += TRI->getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock; + NumVRegSpilled += RegInfo.getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock; } else { MachineMemOperand *MMO = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, @@ -719,10 +719,12 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, } } -void RISCVInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DstReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DstReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); DebugLoc DL = @@ -730,8 +732,8 @@ void RISCVInstrInfo::loadRegFromStackSlot( unsigned Opcode; if (RISCV::GPRRegClass.hasSubClassEq(RC)) { - Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ? - RISCV::LW : RISCV::LD; + Opcode = RegInfo.getRegSizeInBits(RISCV::GPRRegClass) == 32 ? 
RISCV::LW + : RISCV::LD; } else if (RISCV::GPRF16RegClass.hasSubClassEq(RC)) { Opcode = RISCV::LH_INX; } else if (RISCV::GPRF32RegClass.hasSubClassEq(RC)) { @@ -787,7 +789,7 @@ void RISCVInstrInfo::loadRegFromStackSlot( .addFrameIndex(FI) .addMemOperand(MMO) .setMIFlag(Flags); - NumVRegReloaded += TRI->getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock; + NumVRegReloaded += RegInfo.getRegSizeInBits(*RC) / RISCV::RVVBitsPerBlock; } else { MachineMemOperand *MMO = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, @@ -1378,14 +1380,14 @@ void RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, report_fatal_error("underestimated function size"); storeRegToStackSlot(MBB, MI, TmpGPR, /*IsKill=*/true, FrameIndex, - &RISCV::GPRRegClass, TRI, Register()); + &RISCV::GPRRegClass, Register()); TRI->eliminateFrameIndex(std::prev(MI.getIterator()), /*SpAdj=*/0, /*FIOperandNum=*/1); MI.getOperand(1).setMBB(&RestoreBB); loadRegFromStackSlot(RestoreBB, RestoreBB.end(), TmpGPR, FrameIndex, - &RISCV::GPRRegClass, TRI, Register()); + &RISCV::GPRRegClass, Register()); TRI->eliminateFrameIndex(RestoreBB.back(), /*SpAdj=*/0, /*FIOperandNum=*/1); } @@ -2913,6 +2915,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, case RISCVOp::OPERAND_UIMM9_LSB000: Ok = isShiftedUInt<6, 3>(Imm); break; + case RISCVOp::OPERAND_SIMM8_UNSIGNED: + Ok = isInt<8>(Imm); + break; case RISCVOp::OPERAND_SIMM10_LSB0000_NONZERO: Ok = isShiftedInt<6, 4>(Imm) && (Imm != 0); break; @@ -2934,6 +2939,7 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, // clang-format off CASE_OPERAND_SIMM(5) CASE_OPERAND_SIMM(6) + CASE_OPERAND_SIMM(10) CASE_OPERAND_SIMM(11) CASE_OPERAND_SIMM(12) CASE_OPERAND_SIMM(26) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index c5eddb9e90fbf..0ffe015b9fac8 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -79,10 +79,13 @@ enum RISCVMachineCombinerPattern : unsigned { }; class RISCVInstrInfo : public RISCVGenInstrInfo { + const RISCVRegisterInfo RegInfo; public: explicit RISCVInstrInfo(const RISCVSubtarget &STI); + const RISCVRegisterInfo &getRegisterInfo() const { return RegInfo; } + MCInst getNop() const override; Register isLoadFromStackSlot(const MachineInstr &MI, @@ -113,13 +116,13 @@ class RISCVInstrInfo : public RISCVGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool IsKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DstReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; using TargetInstrInfo::foldMemoryOperandImpl; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index 4cbbba3aa68cb..7637047aabf2d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -18,7 +18,7 @@ // Operand and SDNode transformation definitions. 
//===----------------------------------------------------------------------===// -def simm10 : RISCVSImmOp<10>; +def simm10 : RISCVSImmOp<10>, TImmLeaf<XLenVT, "return isInt<10>(Imm);">; def SImm8UnsignedAsmOperand : SImmAsmOperand<8, "Unsigned"> { let RenderMethod = "addSImm8UnsignedOperands"; @@ -26,7 +26,7 @@ def SImm8UnsignedAsmOperand : SImmAsmOperand<8, "Unsigned"> { // A 8-bit signed immediate allowing range [-128, 255] // but represented as [-128, 127]. -def simm8_unsigned : RISCVOp { +def simm8_unsigned : RISCVOp, TImmLeaf<XLenVT, "return isInt<8>(Imm);"> { let ParserMatchClass = SImm8UnsignedAsmOperand; let EncoderMethod = "getImmOpValue"; let DecoderMethod = "decodeSImmOperand<8>"; @@ -1463,8 +1463,91 @@ let Predicates = [HasStdExtP, IsRV32] in { def riscv_absw : RVSDNode<"ABSW", SDTIntUnaryOp>; -let Predicates = [HasStdExtP] in -def : PatGpr<abs, ABS>; +def SDT_RISCVPLI : SDTypeProfile<1, 1, [SDTCisVec<0>, + SDTCisInt<0>, + SDTCisInt<1>]>; +def riscv_pli : RVSDNode<"PLI", SDT_RISCVPLI>; +def SDT_RISCVPASUB : SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisInt<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>; +def riscv_pasub : RVSDNode<"PASUB", SDT_RISCVPASUB>; +def riscv_pasubu : RVSDNode<"PASUBU", SDT_RISCVPASUB>; -let Predicates = [HasStdExtP, IsRV64] in -def : PatGpr<riscv_absw, ABSW>; +let Predicates = [HasStdExtP] in { + def : PatGpr<abs, ABS>; + + // Basic 8-bit arithmetic patterns + def: Pat<(XLenVecI8VT (add GPR:$rs1, GPR:$rs2)), (PADD_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (sub GPR:$rs1, GPR:$rs2)), (PSUB_B GPR:$rs1, GPR:$rs2)>; + + // Basic 16-bit arithmetic patterns + def: Pat<(XLenVecI16VT (add GPR:$rs1, GPR:$rs2)), (PADD_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (sub GPR:$rs1, GPR:$rs2)), (PSUB_H GPR:$rs1, GPR:$rs2)>; + + // 8-bit saturating add/sub patterns + def: Pat<(XLenVecI8VT (saddsat GPR:$rs1, GPR:$rs2)), (PSADD_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (uaddsat GPR:$rs1, GPR:$rs2)), (PSADDU_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (ssubsat GPR:$rs1, GPR:$rs2)), (PSSUB_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (usubsat GPR:$rs1, GPR:$rs2)), (PSSUBU_B GPR:$rs1, GPR:$rs2)>; + + // 16-bit saturating add/sub patterns + def: Pat<(XLenVecI16VT (saddsat GPR:$rs1, GPR:$rs2)), (PSADD_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (uaddsat GPR:$rs1, GPR:$rs2)), (PSADDU_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (ssubsat GPR:$rs1, GPR:$rs2)), (PSSUB_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (usubsat GPR:$rs1, GPR:$rs2)), (PSSUBU_H GPR:$rs1, GPR:$rs2)>; + + // 8-bit averaging patterns + def: Pat<(XLenVecI8VT (avgfloors GPR:$rs1, GPR:$rs2)), (PAADD_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (avgflooru GPR:$rs1, GPR:$rs2)), (PAADDU_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (riscv_pasub GPR:$rs1, GPR:$rs2)), (PASUB_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (riscv_pasubu GPR:$rs1, GPR:$rs2)), (PASUBU_B GPR:$rs1, GPR:$rs2)>; + + // 16-bit averaging patterns + def: Pat<(XLenVecI16VT (avgfloors GPR:$rs1, GPR:$rs2)), (PAADD_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (avgflooru GPR:$rs1, GPR:$rs2)), (PAADDU_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (riscv_pasub GPR:$rs1, GPR:$rs2)), (PASUB_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (riscv_pasubu GPR:$rs1, GPR:$rs2)), (PASUBU_H GPR:$rs1, GPR:$rs2)>; + + // 8-bit absolute difference patterns + def: Pat<(XLenVecI8VT (abds GPR:$rs1, GPR:$rs2)), (PDIF_B GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI8VT (abdu GPR:$rs1, GPR:$rs2)), (PDIFU_B GPR:$rs1, GPR:$rs2)>; + + // 16-bit absolute difference patterns + def: Pat<(XLenVecI16VT (abds GPR:$rs1,
GPR:$rs2)), (PDIF_H GPR:$rs1, GPR:$rs2)>; + def: Pat<(XLenVecI16VT (abdu GPR:$rs1, GPR:$rs2)), (PDIFU_H GPR:$rs1, GPR:$rs2)>; + + + // 8-bit PLI SD node pattern + def: Pat<(XLenVecI8VT (riscv_pli simm8_unsigned:$imm8)), (PLI_B simm8_unsigned:$imm8)>; + // 16-bit PLI SD node pattern + def: Pat<(XLenVecI16VT (riscv_pli simm10:$imm10)), (PLI_H simm10:$imm10)>; + +} // Predicates = [HasStdExtP] + +let Predicates = [HasStdExtP, IsRV32] in { + // Load/Store patterns + def : StPat<store, SW, GPR, XLenVecI8VT>; + def : StPat<store, SW, GPR, XLenVecI16VT>; + def : LdPat<load, LW, XLenVecI8VT>; + def : LdPat<load, LW, XLenVecI16VT>; +} // Predicates = [HasStdExtP, IsRV32] + +let Predicates = [HasStdExtP, IsRV64] in { + def : PatGpr<riscv_absw, ABSW>; + + // 32-bit PLI SD node pattern + def: Pat<(v2i32 (riscv_pli simm10:$imm10)), (PLI_W simm10:$imm10)>; + + // 32-bit averaging-sub patterns + def: Pat<(v2i32 (riscv_pasub GPR:$rs1, GPR:$rs2)), (PASUB_W GPR:$rs1, GPR:$rs2)>; + def: Pat<(v2i32 (riscv_pasubu GPR:$rs1, GPR:$rs2)), (PASUBU_W GPR:$rs1, GPR:$rs2)>; + + // Load/Store patterns + def : StPat<store, SD, GPR, XLenVecI8VT>; + def : StPat<store, SD, GPR, XLenVecI16VT>; + def : StPat<store, SD, GPR, v2i32>; + def : LdPat<load, LD, XLenVecI8VT>; + def : LdPat<load, LD, XLenVecI16VT>; + def : LdPat<load, LD, v2i32>; +} // Predicates = [HasStdExtP, IsRV64] diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 6605a5ccdfde2..87095e75d5dc4 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -222,6 +222,12 @@ def XLenFVT : ValueTypeByHwMode<[RV64], [f64]>; def XLenPairFVT : ValueTypeByHwMode<[RV32], [f64]>; + +// P extension +def XLenVecI8VT : ValueTypeByHwMode<[RV32, RV64], + [v4i8, v8i8]>; +def XLenVecI16VT : ValueTypeByHwMode<[RV32, RV64], + [v2i16, v4i16]>; def XLenRI : RegInfoByHwMode< [RV32, RV64], [RegInfo<32,32,32>, RegInfo<64,64,64>]>; @@ -238,7 +244,9 @@ class RISCVRegisterClass<list<ValueType> regTypes, int align, dag regList> } class GPRRegisterClass - : RISCVRegisterClass<[XLenVT, XLenFVT], 32, regList> { + : RISCVRegisterClass<[XLenVT, XLenFVT, + // P extension packed vector types: + XLenVecI8VT, XLenVecI16VT, v2i32], 32, regList> { let RegInfos = XLenRI; } diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index 715ac4cedc649..926cc9ea547a6 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -69,6 +69,12 @@ static cl::opt<bool> UseMIPSCCMovInsn("use-riscv-mips-ccmov", cl::desc("Use 'mips.ccmov' instruction"), cl::init(true), cl::Hidden); +static cl::opt<bool> EnablePExtCodeGen( + "enable-p-ext-codegen", + cl::desc("Turn on P Extension codegen (This is a temporary switch where " + "only partial codegen is currently supported)"), + cl::init(false), cl::Hidden); + void RISCVSubtarget::anchor() {} RISCVSubtarget & @@ -104,7 +110,7 @@ RISCVSubtarget::RISCVSubtarget(const Triple &TT, StringRef CPU, RVVVectorBitsMin(RVVVectorBitsMin), RVVVectorBitsMax(RVVVectorBitsMax), FrameLowering( initializeSubtargetDependencies(TT, CPU, TuneCPU, FS, ABIName)), - InstrInfo(*this), RegInfo(getHwMode()), TLInfo(TM, *this) { + InstrInfo(*this), TLInfo(TM, *this) { TSInfo = std::make_unique<RISCVSelectionDAGInfo>(); } @@ -145,6 +151,10 @@ bool RISCVSubtarget::useConstantPoolForLargeInts() const { return !RISCVDisableUsingConstantPoolForLargeInts; } +bool RISCVSubtarget::enablePExtCodeGen() const { + return HasStdExtP && EnablePExtCodeGen; +} + unsigned RISCVSubtarget::getMaxBuildIntsCost() const { // Loading integer from constant pool needs two instructions (the reason why // the minimum cost is 2): an address calculation instruction and a load diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h
b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 4b4fc8f0d8e76..f05115dbeb8cb 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -112,7 +112,6 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { RISCVFrameLowering FrameLowering; RISCVInstrInfo InstrInfo; - RISCVRegisterInfo RegInfo; RISCVTargetLowering TLInfo; /// Initializes using the passed in CPU and feature strings so that we can @@ -140,13 +139,14 @@ } const RISCVInstrInfo *getInstrInfo() const override { return &InstrInfo; } const RISCVRegisterInfo *getRegisterInfo() const override { - return &RegInfo; + return &InstrInfo.getRegisterInfo(); } const RISCVTargetLowering *getTargetLowering() const override { return &TLInfo; } bool enableMachineScheduler() const override { return true; } + bool enableTerminalRule() const override { return true; } bool enablePostRAScheduler() const override { return UsePostRAScheduler; } @@ -322,6 +322,8 @@ } } + bool enablePExtCodeGen() const; + // Returns VLEN divided by DLEN. Where DLEN is the datapath width of the // vector hardware implementation which may be less than VLEN. unsigned getDLenFactor() const { diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 3d8eb4097604a..dca6e9cffebb0 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -969,6 +969,13 @@ InstructionCost RISCVTTIImpl::getScalarizationOverhead( if (isa<ScalableVectorType>(Ty)) return InstructionCost::getInvalid(); + // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16) + // For now, skip all fixed vector cost analysis when P extension is available + // to avoid crashes in getMinRVVVectorSizeInBits() + if (ST->enablePExtCodeGen() && isa<FixedVectorType>(Ty)) { + return 1; // Treat as single instruction cost for now + } + // A build_vector (which is m1 sized or smaller) can be done in no // worse than one vslide1down.vx per element in the type. We could // in theory do an explode_vector in the inverse manner, but our @@ -1625,6 +1632,14 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (!IsVectorType) return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); + // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16) + // For now, skip all fixed vector cost analysis when P extension is available + // to avoid crashes in getMinRVVVectorSizeInBits() + if (ST->enablePExtCodeGen() && + (isa<FixedVectorType>(Dst) || isa<FixedVectorType>(Src))) { + return 1; // Treat as single instruction cost for now + } + // FIXME: Need to compute legalizing cost for illegal types. The current // code handles only legal types and those which can be trivially // promoted to legal.
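For reference: the combinePExtTruncate hook added in RISCVISelLowering.cpp above rewrites the halving-subtract idiom. A minimal sketch of the DAG shape it matches, assuming v4i8 operands and sign extension (the value names are illustrative, not from the patch):

// %a.ext = sign_extend %a : v4i8 -> v4i16
// %b.ext = sign_extend %b : v4i8 -> v4i16
// %sub   = sub %a.ext, %b.ext
// %srl   = srl %sub, (build_vector splat 1)   // shift amount must be a splat of 1
// %res   = truncate %srl : v4i16 -> v4i8
// The whole chain becomes (RISCVISD::PASUB %a, %b) and selects to PASUB_B;
// the zero_extend form becomes RISCVISD::PASUBU and selects to PASUBU_B.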
@@ -2323,6 +2338,13 @@ InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, const Value *Op1) const { assert(Val->isVectorTy() && "This must be a vector type"); + // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16) + // For now, skip all fixed vector cost analysis when P extension is available + // to avoid crashes in getMinRVVVectorSizeInBits() + if (ST->enablePExtCodeGen() && isa<FixedVectorType>(Val)) { + return 1; // Treat as single instruction cost for now + } + if (Opcode != Instruction::ExtractElement && Opcode != Instruction::InsertElement) return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1); diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index fdf9a4fe32fe6..e1ff243bb1a47 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -455,7 +455,7 @@ bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) { True->getOperand(1).setReg(MI.getOperand(2).getReg()); // If True is masked then its passthru needs to be in VRNoV0. MRI->constrainRegClass(True->getOperand(1).getReg(), - TII->getRegClass(True->getDesc(), 1, TRI)); + TII->getRegClass(True->getDesc(), 1)); } MI.setDesc(TII->get(NewOpc)); @@ -675,7 +675,7 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { if (Passthru.getReg().isValid()) MRI->constrainRegClass( Passthru.getReg(), - TII->getRegClass(Src->getDesc(), SrcPassthru.getOperandNo(), TRI)); + TII->getRegClass(Src->getDesc(), SrcPassthru.getOperandNo())); } if (RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags)) { diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp index ba95ad822df75..4f8bf4312a380 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp @@ -24,7 +24,7 @@ using namespace llvm; SPIRVInstrInfo::SPIRVInstrInfo(const SPIRVSubtarget &STI) - : SPIRVGenInstrInfo(STI) {} + : SPIRVGenInstrInfo(STI, RI) {} bool SPIRVInstrInfo::isConstantInstr(const MachineInstr &MI) const { switch (MI.getOpcode()) { diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index 7c1db3cfcd6b4..ef45d31a029d3 100644 --- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -235,7 +235,7 @@ class SparcOperand : public MCParsedAsmOperand { }; struct RegOp { - unsigned RegNum; + MCRegister Reg; RegisterKind Kind; }; @@ -244,8 +244,8 @@ class SparcOperand : public MCParsedAsmOperand { }; struct MemOp { - unsigned Base; - unsigned OffsetReg; + MCRegister Base; + MCRegister OffsetReg; const MCExpr *Off; }; @@ -326,7 +326,7 @@ class SparcOperand : public MCParsedAsmOperand { MCRegister getReg() const override { assert((Kind == k_Register) && "Invalid access!"); - return Reg.RegNum; + return Reg.Reg; } const MCExpr *getImm() const { @@ -334,12 +334,12 @@ class SparcOperand : public MCParsedAsmOperand { return Imm.Val; } - unsigned getMemBase() const { + MCRegister getMemBase() const { assert((Kind == k_MemoryReg || Kind == k_MemoryImm) && "Invalid access!"); return Mem.Base; } - unsigned getMemOffsetReg() const { + MCRegister getMemOffsetReg() const { assert((Kind == k_MemoryReg) && "Invalid access!"); return Mem.OffsetReg; } @@ -376,12 +376,16 @@ class SparcOperand : public MCParsedAsmOperand { void print(raw_ostream &OS, const MCAsmInfo &MAI) const override { switch (Kind) { case k_Token: OS << "Token: "
<< getToken() << "\n"; break; - case k_Register: OS << "Reg: #" << getReg() << "\n"; break; + case k_Register: + OS << "Reg: #" << getReg().id() << "\n"; + break; case k_Immediate: OS << "Imm: " << getImm() << "\n"; break; - case k_MemoryReg: OS << "Mem: " << getMemBase() << "+" - << getMemOffsetReg() << "\n"; break; + case k_MemoryReg: + OS << "Mem: " << getMemBase().id() << "+" << getMemOffsetReg().id() + << "\n"; + break; case k_MemoryImm: assert(getMemOff() != nullptr); - OS << "Mem: " << getMemBase() << "+"; + OS << "Mem: " << getMemBase().id() << "+"; MAI.printExpr(OS, *getMemOff()); OS << "\n"; break; @@ -432,7 +436,7 @@ class SparcOperand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createReg(getMemBase())); - assert(getMemOffsetReg() != 0 && "Invalid offset"); + assert(getMemOffsetReg().isValid() && "Invalid offset"); Inst.addOperand(MCOperand::createReg(getMemOffsetReg())); } @@ -480,10 +484,10 @@ class SparcOperand : public MCParsedAsmOperand { return Op; } - static std::unique_ptr<SparcOperand> CreateReg(unsigned RegNum, unsigned Kind, + static std::unique_ptr<SparcOperand> CreateReg(MCRegister Reg, unsigned Kind, SMLoc S, SMLoc E) { auto Op = std::make_unique<SparcOperand>(k_Register); - Op->Reg.RegNum = RegNum; + Op->Reg.Reg = Reg; Op->Reg.Kind = (SparcOperand::RegisterKind)Kind; Op->StartLoc = S; Op->EndLoc = E; @@ -540,7 +544,7 @@ class SparcOperand : public MCParsedAsmOperand { regIdx = Reg - Sparc::I0 + 24; if (regIdx % 2 || regIdx > 31) return false; - Op.Reg.RegNum = IntPairRegs[regIdx / 2]; + Op.Reg.Reg = IntPairRegs[regIdx / 2]; Op.Reg.Kind = rk_IntPairReg; return true; } @@ -551,7 +555,7 @@ unsigned regIdx = Reg - Sparc::F0; if (regIdx % 2 || regIdx > 31) return false; - Op.Reg.RegNum = DoubleRegs[regIdx / 2]; + Op.Reg.Reg = DoubleRegs[regIdx / 2]; Op.Reg.Kind = rk_DoubleReg; return true; } @@ -574,7 +578,7 @@ Reg = QuadFPRegs[regIdx / 2]; break; } - Op.Reg.RegNum = Reg; + Op.Reg.Reg = Reg; Op.Reg.Kind = rk_QuadReg; return true; } @@ -587,13 +591,13 @@ regIdx = Reg - Sparc::C0; if (regIdx % 2 || regIdx > 31) return false; - Op.Reg.RegNum = CoprocPairRegs[regIdx / 2]; + Op.Reg.Reg = CoprocPairRegs[regIdx / 2]; Op.Reg.Kind = rk_CoprocPairReg; return true; } static std::unique_ptr<SparcOperand> - MorphToMEMrr(unsigned Base, std::unique_ptr<SparcOperand> Op) { + MorphToMEMrr(MCRegister Base, std::unique_ptr<SparcOperand> Op) { MCRegister offsetReg = Op->getReg(); Op->Kind = k_MemoryReg; Op->Mem.Base = Base; @@ -602,8 +606,8 @@ return Op; } - static std::unique_ptr<SparcOperand> - CreateMEMr(unsigned Base, SMLoc S, SMLoc E) { + static std::unique_ptr<SparcOperand> CreateMEMr(MCRegister Base, SMLoc S, + SMLoc E) { auto Op = std::make_unique<SparcOperand>(k_MemoryReg); Op->Mem.Base = Base; Op->Mem.OffsetReg = Sparc::G0; // always 0 @@ -614,11 +618,11 @@ } static std::unique_ptr<SparcOperand> - MorphToMEMri(unsigned Base, std::unique_ptr<SparcOperand> Op) { + MorphToMEMri(MCRegister Base, std::unique_ptr<SparcOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryImm; Op->Mem.Base = Base; - Op->Mem.OffsetReg = 0; + Op->Mem.OffsetReg = MCRegister(); Op->Mem.Off = Imm; return Op; } diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp index f66eb9dbee2dc..6596379061e60 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp @@ -38,8 +38,8 @@ static cl::opt void SparcInstrInfo::anchor() {}
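The SparcOperand changes above (and the matching VEOperand changes further below) are one mechanical migration: raw unsigned register numbers become typed MCRegister values, sentinel comparisons become validity checks, and printing goes through .id(). A minimal sketch of the idiom, with hypothetical names:

// Before: unsigned Reg = Op.getReg(); if (Reg == 0) ... OS << Reg;
MCRegister Reg = Op.getReg();        // typed register
if (!Reg)                            // replaces == 0 / == XX::NoRegister checks
  return false;
OS << "Reg: #" << Reg.id() << "\n";  // ask for the raw id explicitly when printing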
SparcInstrInfo::SparcInstrInfo(const SparcSubtarget &ST) - : SparcGenInstrInfo(ST, SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), RI(ST), - Subtarget(ST) {} + : SparcGenInstrInfo(ST, RI, SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP), + RI(ST), Subtarget(ST) {} /// isLoadFromStackSlot - If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of @@ -527,7 +527,6 @@ void SparcInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -564,10 +563,12 @@ void SparcInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, llvm_unreachable("Can't store this register to stack slot"); } -void SparcInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void SparcInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.h b/llvm/lib/Target/Sparc/SparcInstrInfo.h index 01d0204734943..273888f427992 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.h +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.h @@ -92,14 +92,13 @@ class SparcInstrInfo : public SparcGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; Register getGlobalBaseReg(MachineFunction *MF) const; diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index dcefff99db25b..570bbd884a244 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -360,12 +360,12 @@ bool SystemZELFFrameLowering::spillCalleeSavedRegisters( if (SystemZ::FP64BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), - &SystemZ::FP64BitRegClass, TRI, Register()); + &SystemZ::FP64BitRegClass, Register()); } if (SystemZ::VR128BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), - &SystemZ::VR128BitRegClass, TRI, Register()); + &SystemZ::VR128BitRegClass, Register()); } } @@ -389,10 +389,10 @@ bool SystemZELFFrameLowering::restoreCalleeSavedRegisters( MCRegister Reg = I.getReg(); if (SystemZ::FP64BitRegClass.contains(Reg)) TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(), - &SystemZ::FP64BitRegClass, TRI, Register()); + &SystemZ::FP64BitRegClass, Register()); if (SystemZ::VR128BitRegClass.contains(Reg)) TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(), - 
&SystemZ::VR128BitRegClass, TRI, Register()); + &SystemZ::VR128BitRegClass, Register()); } // Restore call-saved GPRs (but not call-clobbered varargs, which at @@ -1157,12 +1157,12 @@ bool SystemZXPLINKFrameLowering::spillCalleeSavedRegisters( if (SystemZ::FP64BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), - &SystemZ::FP64BitRegClass, TRI, Register()); + &SystemZ::FP64BitRegClass, Register()); } if (SystemZ::VR128BitRegClass.contains(Reg)) { MBB.addLiveIn(Reg); TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(), - &SystemZ::VR128BitRegClass, TRI, Register()); + &SystemZ::VR128BitRegClass, Register()); } } @@ -1189,10 +1189,10 @@ bool SystemZXPLINKFrameLowering::restoreCalleeSavedRegisters( MCRegister Reg = I.getReg(); if (SystemZ::FP64BitRegClass.contains(Reg)) TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(), - &SystemZ::FP64BitRegClass, TRI, Register()); + &SystemZ::FP64BitRegClass, Register()); if (SystemZ::VR128BitRegClass.contains(Reg)) TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(), - &SystemZ::VR128BitRegClass, TRI, Register()); + &SystemZ::VR128BitRegClass, Register()); } // Restore call-saved GPRs (but not call-clobbered varargs, which at diff --git a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp index 5313fba3bed1d..8fc339f59e60a 100644 --- a/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp +++ b/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp @@ -115,11 +115,10 @@ SystemZHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const { } bool SystemZHazardRecognizer::has4RegOps(const MachineInstr *MI) const { - const TargetRegisterInfo *TRI = &TII->getRegisterInfo(); const MCInstrDesc &MID = MI->getDesc(); unsigned Count = 0; for (unsigned OpIdx = 0; OpIdx < MID.getNumOperands(); OpIdx++) { - const TargetRegisterClass *RC = TII->getRegClass(MID, OpIdx, TRI); + const TargetRegisterClass *RC = TII->getRegClass(MID, OpIdx); if (RC == nullptr) continue; if (OpIdx >= MID.getNumDefs() && diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index 2e21f27c9032f..eb1ce4a2101d7 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -60,7 +60,7 @@ static uint64_t allOnes(unsigned int Count) { void SystemZInstrInfo::anchor() {} SystemZInstrInfo::SystemZInstrInfo(const SystemZSubtarget &sti) - : SystemZGenInstrInfo(sti, -1, -1), + : SystemZGenInstrInfo(sti, RI, -1, -1), RI(sti.getSpecialRegisters()->getReturnFunctionAddressRegister(), sti.getHwMode()), STI(sti) {} @@ -1023,8 +1023,8 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, void SystemZInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, - MachineInstr::MIFlag Flags) const { + + Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL = MBBI != MBB.end() ? 
MBBI->getDebugLoc() : DebugLoc(); // Callers may expect a single instruction, so keep 128-bit moves @@ -1036,10 +1036,12 @@ void SystemZInstrInfo::storeRegToStackSlot( FrameIdx); } -void SystemZInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, - int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + Register DestReg, int FrameIdx, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); // Callers may expect a single instruction, so keep 128-bit moves diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h index 7b9ad7b87a14f..4aecdd7498018 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h @@ -281,12 +281,14 @@ class SystemZInstrInfo : public SystemZGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override; diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp index 20f561a8dac34..9b47d237f0702 100644 --- a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp +++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp @@ -54,7 +54,7 @@ class VEAsmParser : public MCTargetAsmParser { uint64_t &ErrorInfo, bool MatchingInlineAsm) override; bool parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; - int parseRegisterName(MCRegister (*matchFn)(StringRef)); + MCRegister parseRegisterName(MCRegister (*matchFn)(StringRef)); ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override; bool parseInstruction(ParseInstructionInfo &Info, StringRef Name, @@ -169,7 +169,7 @@ class VEOperand : public MCParsedAsmOperand { }; struct RegOp { - unsigned RegNum; + MCRegister Reg; }; struct ImmOp { @@ -177,8 +177,8 @@ class VEOperand : public MCParsedAsmOperand { }; struct MemOp { - unsigned Base; - unsigned IndexReg; + MCRegister Base; + MCRegister IndexReg; const MCExpr *Index; const MCExpr *Offset; }; @@ -342,7 +342,7 @@ class VEOperand : public MCParsedAsmOperand { MCRegister getReg() const override { assert((Kind == k_Register) && "Invalid access!"); - return Reg.RegNum; + return Reg.Reg; } const MCExpr *getImm() const { @@ -350,14 +350,14 @@ class VEOperand : public MCParsedAsmOperand { return Imm.Val; } - unsigned getMemBase() const { + MCRegister getMemBase() const { assert((Kind == k_MemoryRegRegImm || Kind == k_MemoryRegImmImm || Kind == k_MemoryRegImm) && "Invalid access!"); return Mem.Base; } - unsigned getMemIndexReg() const { + MCRegister getMemIndexReg() const { assert((Kind == k_MemoryRegRegImm || Kind == k_MemoryZeroRegImm) && "Invalid access!"); 
return Mem.IndexReg; @@ -415,20 +415,21 @@ class VEOperand : public MCParsedAsmOperand { OS << "Token: " << getToken() << "\n"; break; case k_Register: - OS << "Reg: #" << getReg() << "\n"; + OS << "Reg: #" << getReg().id() << "\n"; break; case k_Immediate: OS << "Imm: " << getImm() << "\n"; break; case k_MemoryRegRegImm: assert(getMemOffset() != nullptr); - OS << "Mem: #" << getMemBase() << "+#" << getMemIndexReg() << "+"; + OS << "Mem: #" << getMemBase().id() << "+#" << getMemIndexReg().id() + << "+"; MAI.printExpr(OS, *getMemOffset()); OS << "\n"; break; case k_MemoryRegImmImm: assert(getMemIndex() != nullptr && getMemOffset() != nullptr); - OS << "Mem: #" << getMemBase() << "+"; + OS << "Mem: #" << getMemBase().id() << "+"; MAI.printExpr(OS, *getMemIndex()); OS << "+"; MAI.printExpr(OS, *getMemOffset()); @@ -436,7 +437,7 @@ break; case k_MemoryZeroRegImm: assert(getMemOffset() != nullptr); - OS << "Mem: 0+#" << getMemIndexReg() << "+"; + OS << "Mem: 0+#" << getMemIndexReg().id() << "+"; MAI.printExpr(OS, *getMemOffset()); OS << "\n"; break; @@ -450,7 +451,7 @@ break; case k_MemoryRegImm: assert(getMemOffset() != nullptr); - OS << "Mem: #" << getMemBase() << "+"; + OS << "Mem: #" << getMemBase().id() << "+"; MAI.printExpr(OS, *getMemOffset()); OS << "\n"; break; @@ -606,10 +607,10 @@ return Op; } - static std::unique_ptr<VEOperand> CreateReg(unsigned RegNum, SMLoc S, + static std::unique_ptr<VEOperand> CreateReg(MCRegister Reg, SMLoc S, SMLoc E) { auto Op = std::make_unique<VEOperand>(k_Register); - Op->Reg.RegNum = RegNum; + Op->Reg.Reg = Reg; Op->StartLoc = S; Op->EndLoc = E; return Op; @@ -653,38 +654,38 @@ } static bool MorphToI32Reg(VEOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); unsigned regIdx = Reg - VE::SX0; if (regIdx > 63) return false; - Op.Reg.RegNum = I32Regs[regIdx]; + Op.Reg.Reg = I32Regs[regIdx]; return true; } static bool MorphToF32Reg(VEOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); unsigned regIdx = Reg - VE::SX0; if (regIdx > 63) return false; - Op.Reg.RegNum = F32Regs[regIdx]; + Op.Reg.Reg = F32Regs[regIdx]; return true; } static bool MorphToF128Reg(VEOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); unsigned regIdx = Reg - VE::SX0; if (regIdx % 2 || regIdx > 63) return false; - Op.Reg.RegNum = F128Regs[regIdx / 2]; + Op.Reg.Reg = F128Regs[regIdx / 2]; return true; } static bool MorphToVM512Reg(VEOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); unsigned regIdx = Reg - VE::VM0; if (regIdx % 2 || regIdx > 15) return false; - Op.Reg.RegNum = VM512Regs[regIdx / 2]; + Op.Reg.Reg = VM512Regs[regIdx / 2]; return true; } @@ -696,16 +697,16 @@ if (regIdx > 31 || MISCRegs[regIdx] == VE::NoRegister) return false; Op.Kind = k_Register; - Op.Reg.RegNum = MISCRegs[regIdx]; + Op.Reg.Reg = MISCRegs[regIdx]; return true; } static std::unique_ptr<VEOperand> - MorphToMEMri(unsigned Base, std::unique_ptr<VEOperand> Op) { + MorphToMEMri(MCRegister Base, std::unique_ptr<VEOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryRegImm; Op->Mem.Base = Base; - Op->Mem.IndexReg = 0; + Op->Mem.IndexReg = MCRegister(); Op->Mem.Index = nullptr; Op->Mem.Offset = Imm; return Op; } @@ -715,15 +716,16 @@ MorphToMEMzi(std::unique_ptr<VEOperand> Op) { const MCExpr *Imm =
Op->getImm(); Op->Kind = k_MemoryZeroImm; - Op->Mem.Base = 0; - Op->Mem.IndexReg = 0; + Op->Mem.Base = MCRegister(); + Op->Mem.IndexReg = MCRegister(); Op->Mem.Index = nullptr; Op->Mem.Offset = Imm; return Op; } static std::unique_ptr<VEOperand> - MorphToMEMrri(unsigned Base, unsigned Index, std::unique_ptr<VEOperand> Op) { + MorphToMEMrri(MCRegister Base, MCRegister Index, + std::unique_ptr<VEOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryRegRegImm; Op->Mem.Base = Base; @@ -734,22 +736,22 @@ } static std::unique_ptr<VEOperand> - MorphToMEMrii(unsigned Base, const MCExpr *Index, + MorphToMEMrii(MCRegister Base, const MCExpr *Index, std::unique_ptr<VEOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryRegImmImm; Op->Mem.Base = Base; - Op->Mem.IndexReg = 0; + Op->Mem.IndexReg = MCRegister(); Op->Mem.Index = Index; Op->Mem.Offset = Imm; return Op; } static std::unique_ptr<VEOperand> - MorphToMEMzri(unsigned Index, std::unique_ptr<VEOperand> Op) { + MorphToMEMzri(MCRegister Index, std::unique_ptr<VEOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryZeroRegImm; - Op->Mem.Base = 0; + Op->Mem.Base = MCRegister(); Op->Mem.IndexReg = Index; Op->Mem.Index = nullptr; Op->Mem.Offset = Imm; @@ -760,8 +762,8 @@ MorphToMEMzii(const MCExpr *Index, std::unique_ptr<VEOperand> Op) { const MCExpr *Imm = Op->getImm(); Op->Kind = k_MemoryZeroImmImm; - Op->Mem.Base = 0; - Op->Mem.IndexReg = 0; + Op->Mem.Base = MCRegister(); + Op->Mem.IndexReg = MCRegister(); Op->Mem.Index = Index; Op->Mem.Offset = Imm; return Op; @@ -815,14 +817,14 @@ bool VEAsmParser::parseRegister(MCRegister &Reg, SMLoc &StartLoc, /// Parses a register name using a given matching function. /// Checks for lowercase or uppercase if necessary. -int VEAsmParser::parseRegisterName(MCRegister (*matchFn)(StringRef)) { +MCRegister VEAsmParser::parseRegisterName(MCRegister (*matchFn)(StringRef)) { StringRef Name = Parser.getTok().getString(); - int RegNum = matchFn(Name); + MCRegister RegNum = matchFn(Name); // GCC supports case insensitive register names. All of the VE registers // are all lower case.
- if (RegNum == VE::NoRegister) { + if (!RegNum) { RegNum = matchFn(Name.lower()); } diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp index d5e804afd27fe..b9ac5d6254362 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.cpp +++ b/llvm/lib/Target/VE/VEInstrInfo.cpp @@ -35,7 +35,7 @@ using namespace llvm; void VEInstrInfo::anchor() {} VEInstrInfo::VEInstrInfo(const VESubtarget &ST) - : VEGenInstrInfo(ST, VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {} + : VEGenInstrInfo(ST, RI, VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {} static bool IsIntegerCC(unsigned CC) { return (CC < VECC::CC_AF); } @@ -459,7 +459,6 @@ void VEInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; @@ -519,10 +518,12 @@ void VEInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, report_fatal_error("Can't store this register to stack slot"); } -void VEInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, - int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void VEInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/llvm/lib/Target/VE/VEInstrInfo.h b/llvm/lib/Target/VE/VEInstrInfo.h index 408d3ab9e05f5..cedf7f21011ff 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.h +++ b/llvm/lib/Target/VE/VEInstrInfo.h @@ -92,13 +92,15 @@ class VEInstrInfo : public VEGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; /// } Stack Spill & Reload diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index 343d90e88950f..8b4e4fbbbd1e5 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -34,7 +34,7 @@ using namespace llvm; #include "WebAssemblyGenInstrInfo.inc" WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI) - : WebAssemblyGenInstrInfo(STI, WebAssembly::ADJCALLSTACKDOWN, + : WebAssemblyGenInstrInfo(STI, RI, WebAssembly::ADJCALLSTACKDOWN, WebAssembly::ADJCALLSTACKUP, WebAssembly::CATCHRET), RI(STI.getTargetTriple()) {} diff --git a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp index d2e35277419f7..9473e8db3af93 100644 --- a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp +++ b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp @@ -387,8 +387,8 @@ void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, MachineMemOperand *LMMO = *LoadInst->memoperands_begin(); MachineMemOperand 
*SMMO = *StoreInst->memoperands_begin(); - Register Reg1 = MRI->createVirtualRegister( - TII->getRegClass(TII->get(NLoadOpcode), 0, TRI)); + Register Reg1 = + MRI->createVirtualRegister(TII->getRegClass(TII->get(NLoadOpcode), 0)); MachineInstr *NewLoad = BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode), Reg1) @@ -553,7 +553,7 @@ void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) { } unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) { - const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI); + const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0); return TRI->getRegSizeInBits(*TRC) / 8; } diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp index 5d190114615de..2047a53199dd6 100644 --- a/llvm/lib/Target/X86/X86DomainReassignment.cpp +++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp @@ -174,8 +174,8 @@ class InstrReplacerDstCOPY : public InstrConverterBase { MachineBasicBlock *MBB = MI->getParent(); const DebugLoc &DL = MI->getDebugLoc(); - Register Reg = MRI->createVirtualRegister( - TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo())); + Register Reg = + MRI->createVirtualRegister(TII->getRegClass(TII->get(DstOpcode), 0)); MachineInstrBuilder Bld = BuildMI(*MBB, MI, DL, TII->get(DstOpcode), Reg); for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) Bld.add(MO); diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp index 06f729a7e0cdc..25799f4ac0ea0 100644 --- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp @@ -206,8 +206,7 @@ void X86FastPreTileConfig::spill(MachineBasicBlock::iterator Before, const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); // Don't need shape information for tile store, becasue it is adjacent to // the tile def instruction. - TII->storeRegToStackSlot(*MBB, Before, VirtReg, Kill, FI, &RC, TRI, - Register()); + TII->storeRegToStackSlot(*MBB, Before, VirtReg, Kill, FI, &RC, Register()); ++NumStores; // TODO: update DBG_VALUEs diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index a66a3213403b4..8bca6344d6521 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -3093,8 +3093,8 @@ bool X86FrameLowering::spillCalleeSavedRegisters( MBB.addLiveIn(Reg); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); - TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI, - Register(), MachineInstr::FrameSetup); + TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, Register(), + MachineInstr::FrameSetup); } return true; @@ -3166,8 +3166,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters( VT = STI.hasBWI() ? MVT::v64i1 : MVT::v16i1; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); - TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, TRI, - Register()); + TII.loadRegFromStackSlot(MBB, MI, Reg, I.getFrameIdx(), RC, Register()); } // Clear the stack slot for spill base pointer register. 
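The X86 hunks that follow repeat the signature migration already seen for RISC-V, SystemZ, Sparc, VE and the frame-lowering callers above: storeRegToStackSlot, loadRegFromStackSlot and getRegClass drop their TargetRegisterInfo argument now that each target's InstrInfo owns (or can reach) its RegisterInfo. A sketch of the call-site change, following the argument names used in the diff:

// Before:
//   const TargetRegisterClass *RC = TII->getRegClass(Desc, OpNum, TRI);
//   TII->storeRegToStackSlot(MBB, MI, Reg, /*isKill=*/true, FrameIdx, RC, TRI, Register());
// After (the TRI parameter is simply gone):
const TargetRegisterClass *RC = TII->getRegClass(Desc, OpNum);
TII->storeRegToStackSlot(MBB, MI, Reg, /*isKill=*/true, FrameIdx, RC, Register());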
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 6b2a7a4ec3583..61d9608160197 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -85,7 +85,7 @@ static cl::opt<unsigned> UndefRegClearance( void X86InstrInfo::anchor() {} X86InstrInfo::X86InstrInfo(const X86Subtarget &STI) - : X86GenInstrInfo(STI, + : X86GenInstrInfo(STI, RI, (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 @@ -93,10 +93,9 @@ X86InstrInfo::X86InstrInfo(const X86Subtarget &STI) X86::CATCHRET, (STI.is64Bit() ? X86::RET64 : X86::RET32)), Subtarget(STI), RI(STI.getTargetTriple()) {} -const TargetRegisterClass * -X86InstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI) const { - auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum, TRI); +const TargetRegisterClass *X86InstrInfo::getRegClass(const MCInstrDesc &MCID, + unsigned OpNum) const { + auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum); // If the target does not have egpr, then r16-r31 will be resereved for all // instructions. if (!RC || !Subtarget.hasEGPR()) @@ -958,8 +957,7 @@ bool X86InstrInfo::isReMaterializableImpl( void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const { + const MachineInstr &Orig) const { bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI); if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) != MachineBasicBlock::LQR_Dead) { @@ -4782,14 +4780,14 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB, void X86InstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, - MachineInstr::MIFlag Flags) const { + + Register VReg, MachineInstr::MIFlag Flags) const { const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); - assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && + assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) && "Stack slot too small for store"); - unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); + unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16); bool isAligned = (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx)); @@ -4803,15 +4801,17 @@ void X86InstrInfo::storeRegToStackSlot( .setMIFlag(Flags); } -void X86InstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, - int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + Register DestReg, int FrameIdx, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); - assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && + assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) && "Load size exceeds stack slot"); - unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); + unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16); bool isAligned =
(Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx)); @@ -5553,7 +5553,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, return false; ShouldUpdateCC = true; } else if (ImmDelta != 0) { - unsigned BitWidth = TRI->getRegSizeInBits(*MRI->getRegClass(SrcReg)); + unsigned BitWidth = RI.getRegSizeInBits(*MRI->getRegClass(SrcReg)); // Shift amount for min/max constants to adjust for 8/16/32 instruction // sizes. switch (OldCC) { @@ -7235,7 +7235,6 @@ static void updateOperandRegConstraints(MachineFunction &MF, MachineInstr &NewMI, const TargetInstrInfo &TII) { MachineRegisterInfo &MRI = MF.getRegInfo(); - const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); for (int Idx : llvm::seq<int>(0, NewMI.getNumOperands())) { MachineOperand &MO = NewMI.getOperand(Idx); @@ -7247,7 +7246,7 @@ static void updateOperandRegConstraints(MachineFunction &MF, continue; auto *NewRC = - MRI.constrainRegClass(Reg, TII.getRegClass(NewMI.getDesc(), Idx, &TRI)); + MRI.constrainRegClass(Reg, TII.getRegClass(NewMI.getDesc(), Idx)); if (!NewRC) { LLVM_DEBUG( dbgs() << "WARNING: Unable to update register constraint for operand " @@ -7345,7 +7344,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( unsigned SrcIdx = (Imm >> 6) & 3; const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if ((Size == 0 || Size >= 16) && RCSize >= 16 && (MI.getOpcode() != X86::INSERTPSrri || Alignment >= Align(4))) { @@ -7370,7 +7369,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( // TODO: In most cases AVX doesn't have a 8-byte alignment requirement. if (OpNum == 2) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) { unsigned NewOpCode = @@ -7389,7 +7388,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( // table twice. if (OpNum == 2) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) { MachineInstr *NewMI = @@ -7524,7 +7523,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( bool NarrowToMOV32rm = false; if (Size) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI); + const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not.
@@ -8118,9 +8117,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( RC == &X86::VK32WMRegClass || RC == &X86::VK64WMRegClass; }; - if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1, &RI))) + if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1))) MaskReg = Op1.getReg(); - else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2, &RI))) + else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2))) MaskReg = Op2.getReg(); if (MaskReg) { @@ -8524,7 +8523,7 @@ bool X86InstrInfo::unfoldMemoryOperand( const MCInstrDesc &MCID = get(Opc); - const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI); + const TargetRegisterClass *RC = getRegClass(MCID, Index); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); // TODO: Check if 32-byte or greater accesses are slow too? if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass && @@ -8635,7 +8634,7 @@ bool X86InstrInfo::unfoldMemoryOperand( // Emit the store instruction. if (UnfoldStore) { - const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI); + const TargetRegisterClass *DstRC = getRegClass(MCID, 0); auto MMOs = extractStoreMMOs(MI.memoperands(), MF); unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16); bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment; @@ -8667,7 +8666,7 @@ bool X86InstrInfo::unfoldMemoryOperand( const MCInstrDesc &MCID = get(Opc); MachineFunction &MF = DAG.getMachineFunction(); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI); + const TargetRegisterClass *RC = getRegClass(MCID, Index); unsigned NumDefs = MCID.NumDefs; std::vector<SDValue> AddrOps; std::vector<SDValue> BeforeOps; @@ -8718,7 +8717,7 @@ bool X86InstrInfo::unfoldMemoryOperand( std::vector<EVT> VTs; const TargetRegisterClass *DstRC = nullptr; if (MCID.getNumDefs() > 0) { - DstRC = getRegClass(MCID, 0, &RI); + DstRC = getRegClass(MCID, 0); VTs.push_back(*TRI.legalclasstypes_begin(*DstRC)); } for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 5f75559bd9598..a547fcd421411 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -246,9 +246,8 @@ class X86InstrInfo final : public X86GenInstrInfo { /// GR*RegClass (definition in TD file) /// -> /// GR*_NOREX2RegClass (Returned register class) - const TargetRegisterClass * - getRegClass(const MCInstrDesc &MCID, unsigned OpNum, - const TargetRegisterInfo *TRI) const override; + const TargetRegisterClass *getRegClass(const MCInstrDesc &MCID, + unsigned OpNum) const override; /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As /// such, whenever a client has an instance of instruction info, it should @@ -343,8 +342,7 @@ class X86InstrInfo final : public X86GenInstrInfo { bool isReMaterializableImpl(const MachineInstr &MI) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, - const MachineInstr &Orig, - const TargetRegisterInfo &TRI) const override; + const MachineInstr &Orig) const override; /// Given an operand within a MachineInstr, insert preceding code to put it /// into the right format for a particular kind of LEA instruction.
This may @@ -469,14 +467,14 @@ class X86InstrInfo final : public X86GenInstrInfo { bool RenamableSrc = false) const override; void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadStoreTileReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp index 167bed132cd12..c9646053afac1 100644 --- a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp +++ b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp @@ -359,7 +359,7 @@ bool X86OptimizeLEAPass::chooseBestLEA( // example MOV8mr_NOREX. We could constrain the register class of the LEA // def to suit MI, however since this case is very rare and hard to // reproduce in a test it's just more reliable to skip the LEA. - if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI) != + if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg) != MRI->getRegClass(DefMI->getOperand(0).getReg())) continue; diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp index e0b3b61e29175..d0d897e6784d3 100644 --- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -841,7 +841,7 @@ getRegClassForUnfoldedLoad(const X86InstrInfo &TII, unsigned Opcode) { unsigned UnfoldedOpc = TII.getOpcodeAfterMemoryUnfold( Opcode, /*UnfoldLoad*/ true, /*UnfoldStore*/ false, &Index); const MCInstrDesc &MCID = TII.get(UnfoldedOpc); - return TII.getRegClass(MCID, Index, &TII.getRegisterInfo()); + return TII.getRegClass(MCID, Index); } void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads( diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 868f41375b96b..4f5aadca361fe 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -419,6 +419,8 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// Enable the MachineScheduler pass for all X86 subtargets. 
bool enableMachineScheduler() const override { return true; } + bool enableTerminalRule() const override { return true; } + bool enableEarlyIfConversion() const override; void getPostRAMutations(std::vector> diff --git a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp index 096ad08d8a3c9..0e00db495256c 100644 --- a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp +++ b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp @@ -69,7 +69,7 @@ static bool readInstruction32(ArrayRef Bytes, uint64_t Address, return true; } -static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) { +static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo) { const MCRegisterInfo *RegInfo = D->getContext().getRegisterInfo(); return RegInfo->getRegClass(RC).getRegister(RegNo); } @@ -79,7 +79,7 @@ static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, unsigned RegNo, const MCDisassembler *Decoder) { if (RegNo > 11) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, XCore::GRRegsRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, XCore::GRRegsRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } @@ -89,7 +89,7 @@ static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst, unsigned RegNo, const MCDisassembler *Decoder) { if (RegNo > 15) return MCDisassembler::Fail; - unsigned Reg = getReg(Decoder, XCore::RRegsRegClassID, RegNo); + MCRegister Reg = getReg(Decoder, XCore::RRegsRegClassID, RegNo); Inst.addOperand(MCOperand::createReg(Reg)); return MCDisassembler::Success; } diff --git a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp index cdb5186d23d3c..351a221c92ebd 100644 --- a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp @@ -432,7 +432,7 @@ bool XCoreFrameLowering::spillCalleeSavedRegisters( // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI, + TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, Register()); if (emitFrameMoves) { auto Store = MI; @@ -458,8 +458,7 @@ bool XCoreFrameLowering::restoreCalleeSavedRegisters( "LR & FP are always handled in emitEpilogue"); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.loadRegFromStackSlot(MBB, MI, Reg, CSR.getFrameIdx(), RC, TRI, - Register()); + TII.loadRegFromStackSlot(MBB, MI, Reg, CSR.getFrameIdx(), RC, Register()); assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!"); // Insert in reverse order. 
loadRegFromStackSlot can insert multiple diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp index 1a9133aad4dd3..075910c84fb84 100644 --- a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp @@ -43,7 +43,7 @@ namespace XCore { void XCoreInstrInfo::anchor() {} XCoreInstrInfo::XCoreInstrInfo(const XCoreSubtarget &ST) - : XCoreGenInstrInfo(ST, XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP), + : XCoreGenInstrInfo(ST, RI, XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP), RI() {} static bool isZeroImm(const MachineOperand &op) { @@ -355,8 +355,7 @@ void XCoreInstrInfo::copyPhysReg(MachineBasicBlock &MBB, void XCoreInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, - MachineInstr::MIFlag Flags) const { + Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; if (I != MBB.end() && !I->isDebugInstr()) DL = I->getDebugLoc(); @@ -377,7 +377,6 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL; diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.h b/llvm/lib/Target/XCore/XCoreInstrInfo.h index 3543392653786..c4e399ebd3fd8 100644 --- a/llvm/lib/Target/XCore/XCoreInstrInfo.h +++ b/llvm/lib/Target/XCore/XCoreInstrInfo.h @@ -71,13 +71,13 @@ class XCoreInstrInfo : public XCoreGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; bool reverseBranchCondition( diff --git a/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp index cf9a2a052978d..1c0dc66a46144 100644 --- a/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp +++ b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp @@ -314,7 +314,7 @@ bool XtensaFrameLowering::spillCalleeSavedRegisters( bool IsKill = !IsA0AndRetAddrIsTaken; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.storeRegToStackSlot(EntryBlock, MI, Reg, IsKill, CSI[i].getFrameIdx(), - RC, TRI, Register()); + RC, Register()); } return true; diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp index be69cefb5b78f..d7b05acea9411 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp @@ -48,7 +48,8 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI) { } XtensaInstrInfo::XtensaInstrInfo(const XtensaSubtarget &STI) - : XtensaGenInstrInfo(STI, Xtensa::ADJCALLSTACKDOWN, Xtensa::ADJCALLSTACKUP), + : XtensaGenInstrInfo(STI, RI, Xtensa::ADJCALLSTACKDOWN, + Xtensa::ADJCALLSTACKUP), RI(STI), STI(STI) {} Register XtensaInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, @@ -144,8 +145,7 @@ void XtensaInstrInfo::copyPhysReg(MachineBasicBlock &MBB, void
XtensaInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, bool isKill, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, - MachineInstr::MIFlag Flags) const { + Register VReg, MachineInstr::MIFlag Flags) const { DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); unsigned LoadOpcode, StoreOpcode; getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode, FrameIdx); @@ -154,10 +155,12 @@ void XtensaInstrInfo::storeRegToStackSlot( addFrameReference(MIB, FrameIdx); } -void XtensaInstrInfo::loadRegFromStackSlot( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, - int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, - Register VReg, MachineInstr::MIFlag Flags) const { +void XtensaInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + Register DestReg, int FrameIdx, + const TargetRegisterClass *RC, + Register VReg, + MachineInstr::MIFlag Flags) const { DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); unsigned LoadOpcode, StoreOpcode; getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode, FrameIdx); @@ -543,12 +546,12 @@ void XtensaInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, "function code size is significantly larger than estimated"); storeRegToStackSlot(MBB, L32R, ScavRegister, /*IsKill=*/true, FrameIndex, - &Xtensa::ARRegClass, &RI, Register()); + &Xtensa::ARRegClass, Register()); RI.eliminateFrameIndex(std::prev(L32R.getIterator()), /*SpAdj=*/0, /*FIOperandNum=*/1); loadRegFromStackSlot(RestoreBB, RestoreBB.end(), ScavRegister, FrameIndex, - &Xtensa::ARRegClass, &RI, Register()); + &Xtensa::ARRegClass, Register()); RI.eliminateFrameIndex(RestoreBB.back(), /*SpAdj=*/0, /*FIOperandNum=*/1); JumpToMBB = &RestoreBB; diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.h b/llvm/lib/Target/Xtensa/XtensaInstrInfo.h index 1808cb36d8a9b..0b46d6ce2fdb7 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.h +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.h @@ -56,14 +56,13 @@ class XtensaInstrInfo : public XtensaGenInstrInfo { void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, - bool isKill, int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; void loadRegFromStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, int FrameIdx, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI, Register VReg, + Register VReg, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; // Get the load and store opcodes for a given register class and offset. diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 4ba4ba3850e58..eab1d4975ac96 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -196,6 +196,18 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { return true; } +// Ensure we stay within the bounds of FP values that can be represented as +// integers without gaps, which are 2^24 and 2^53 for IEEE-754 single and double +// precision respectively (both on the negative and positive side).
+static bool isRepresentableAsExactInteger(ConstantFP *FPVal, int64_t IntVal) { + const auto &InitValueFltSema = FPVal->getValueAPF().getSemantics(); + if (!APFloat::isIEEELikeFP(InitValueFltSema)) + return false; + + return isUIntN(APFloat::semanticsPrecision(InitValueFltSema), + AbsoluteValue(IntVal)); +} + /// If the loop has floating induction variable then insert corresponding /// integer induction variable if possible. /// For example, @@ -212,7 +224,8 @@ bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge)); int64_t InitValue; - if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue)) + if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue) || + !isRepresentableAsExactInteger(InitValueVal, InitValue)) return false; // Check IV increment. Reject this PN if increment operation is not @@ -262,7 +275,8 @@ bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1)); int64_t ExitValue; if (ExitValueVal == nullptr || - !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue)) + !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue) || + !isRepresentableAsExactInteger(ExitValueVal, ExitValue)) return false; // Find new predicate for integer comparison. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 566d6eafee63e..345bc63081b81 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -249,6 +249,11 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle( "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask."))); +cl::opt<bool> llvm::EnableWideActiveLaneMask( + "enable-wide-lane-mask", cl::init(false), cl::Hidden, + cl::desc("Enable use of wide lane masks when used for control flow in " + "tail-folded loops")); + static cl::opt<bool> MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " @@ -1314,6 +1319,12 @@ class LoopVectorizationCostModel { return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; } + /// Returns true if tail-folding is preferred over a scalar epilogue. + bool preferPredicatedLoop() const { + return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate || + ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate; + } + /// Returns the TailFoldingStyle that is best for the current loop. TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { if (!ChosenTailFoldingStyle) @@ -1374,6 +1385,17 @@ class LoopVectorizationCostModel { return getTailFoldingStyle() != TailFoldingStyle::None; } + /// Returns true if the use of wide lane masks is requested and the loop is + /// using tail-folding with a lane mask for control flow. + bool useWideActiveLaneMask() const { + if (!EnableWideActiveLaneMask) + return false; + + TailFoldingStyle TF = getTailFoldingStyle(); + return TF == TailFoldingStyle::DataAndControlFlow || + TF == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; + } + /// Return maximum safe number of elements to be processed per vector /// iteration, which do not prevent store-load forwarding and are safe with /// regard to the memory dependencies. Required for EVL-based VPlans to @@ -4560,7 +4582,12 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, // 3.
We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - if (!CM.isScalarEpilogueAllowed()) + // Only interleave tail-folded loops if wide lane masks are requested, as the + // overhead of multiple instructions to calculate the predicate is likely + // not beneficial. If a scalar epilogue is not allowed for any other reason, + // do not interleave. + if (!CM.isScalarEpilogueAllowed() && + !(CM.preferPredicatedLoop() && CM.useWideActiveLaneMask())) return 1; if (any_of(Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), @@ -8226,15 +8253,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction, } VPValue *Cond = nullptr; - if (CM.blockNeedsPredicationForAnyReason(ReductionI->getParent())) { - assert((ReductionOpcode == Instruction::Add || - ReductionOpcode == Instruction::Sub) && - "Expected an ADD or SUB operation for predicated partial " - "reductions (because the neutral element in the mask is zero)!"); + if (CM.blockNeedsPredicationForAnyReason(ReductionI->getParent())) Cond = getBlockInMask(Builder.getInsertBlock()); - VPValue *Zero = Plan.getConstantInt(ReductionI->getType(), 0); - BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc()); - } return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond, ScaleFactor, ReductionI); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 5e4303a4c5fff..90696ffc3aca7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -99,20 +99,20 @@ VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def) VPValue::~VPValue() { assert(Users.empty() && "trying to delete a VPValue with remaining users"); - if (Def) + if (VPDef *Def = getDefiningRecipe()) Def->removeDefinedValue(this); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const { - if (const VPRecipeBase *R = dyn_cast_or_null(Def)) + if (const VPRecipeBase *R = getDefiningRecipe()) R->print(OS, "", SlotTracker); else printAsOperand(OS, SlotTracker); } void VPValue::dump() const { - const VPRecipeBase *Instr = dyn_cast_or_null(this->Def); + const VPRecipeBase *Instr = getDefiningRecipe(); VPSlotTracker SlotTracker( (Instr && Instr->getParent()) ? 
Instr->getParent()->getPlan() : nullptr); print(dbgs(), SlotTracker); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 72858e1265d86..3840b464e6c2c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1787,12 +1787,6 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags { return getOperand(I + 1)->isDefinedOutsideLoopRegions(); } - bool areAllOperandsInvariant() const { - return all_of(operands(), [](VPValue *Op) { - return Op->isDefinedOutsideLoopRegions(); - }); - } - public: VPWidenGEPRecipe(GetElementPtrInst *GEP, ArrayRef<VPValue *> Operands) : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP), diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 707886f873fba..ba145ffa0b681 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -311,18 +311,27 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, std::optional<unsigned> Opcode; VPValue *Op = getVecOp(); uint64_t MulConst; + + InstructionCost CondCost = 0; + if (isConditional()) { + CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + auto *VecTy = Ctx.Types.inferScalarType(Op); + auto *CondTy = Ctx.Types.inferScalarType(getCondOp()); + CondCost = Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VecTy, CondTy, + Pred, Ctx.CostKind); + } + // If the partial reduction is predicated, a select will be operand 1. // If it isn't predicated and the mul isn't operating on a constant, then it // should have been turned into a VPExpressionRecipe. // FIXME: Replace the entire function with this once all partial reduction // variants are bundled into VPExpressionRecipe. - if (!match(Op, m_Select(m_VPValue(), m_VPValue(Op), m_VPValue())) && - !match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) { + if (!match(Op, m_Mul(m_VPValue(), m_ConstantInt(MulConst)))) { auto *PhiType = Ctx.Types.inferScalarType(getChainOp()); auto *InputType = Ctx.Types.inferScalarType(getVecOp()); - return Ctx.TTI.getPartialReductionCost(getOpcode(), InputType, InputType, - PhiType, VF, TTI::PR_None, - TTI::PR_None, {}, Ctx.CostKind); + return CondCost + Ctx.TTI.getPartialReductionCost( + getOpcode(), InputType, InputType, PhiType, VF, + TTI::PR_None, TTI::PR_None, {}, Ctx.CostKind); } VPRecipeBase *OpR = Op->getDefiningRecipe(); @@ -381,12 +390,12 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, } else if (auto Widen = dyn_cast<VPWidenRecipe>(OpR)) { HandleWiden(Widen); } else if (auto Reduction = dyn_cast<VPPartialReductionRecipe>(OpR)) { - return Reduction->computeCost(VF, Ctx); + return CondCost + Reduction->computeCost(VF, Ctx); } auto *PhiType = Ctx.Types.inferScalarType(getOperand(1)); - return Ctx.TTI.getPartialReductionCost(getOpcode(), InputTypeA, InputTypeB, - PhiType, VF, ExtAType, ExtBType, - Opcode, Ctx.CostKind); + return CondCost + Ctx.TTI.getPartialReductionCost( + getOpcode(), InputTypeA, InputTypeB, PhiType, VF, + ExtAType, ExtBType, Opcode, Ctx.CostKind); } void VPPartialReductionRecipe::execute(VPTransformState &State) { @@ -395,12 +405,18 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) { assert(getOpcode() == Instruction::Add && "Unhandled partial reduction opcode"); - Value *BinOpVal = State.get(getOperand(1)); - Value *PhiVal = State.get(getOperand(0)); + Value *BinOpVal = State.get(getVecOp()); + Value *PhiVal = State.get(getChainOp()); assert(PhiVal && BinOpVal && "Phi and Mul must be set"); Type *RetTy = PhiVal->getType(); +
if (isConditional()) { + Value *Cond = State.get(getCondOp()); + Value *Zero = ConstantInt::get(BinOpVal->getType(), 0); + BinOpVal = Builder.CreateSelect(Cond, BinOpVal, Zero); + } + CallInst *V = Builder.CreateIntrinsic(RetTy, Intrinsic::vector_partial_reduce_add, {PhiVal, BinOpVal}, nullptr, "partial.reduce"); @@ -2507,51 +2523,32 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { // is vector-typed. Thus, to keep the representation compact, we only use // vector-typed operands for loop-varying values. - if (areAllOperandsInvariant()) { - // If we are vectorizing, but the GEP has only loop-invariant operands, - // the GEP we build (by only using vector-typed operands for - // loop-varying values) would be a scalar pointer. Thus, to ensure we - // produce a vector of pointers, we need to either arbitrarily pick an - // operand to broadcast, or broadcast a clone of the original GEP. - // Here, we broadcast a clone of the original. - // - // TODO: If at some point we decide to scalarize instructions having - // loop-invariant operands, this special case will no longer be - // required. We would add the scalarization decision to - // collectLoopScalars() and teach getVectorValue() to broadcast - // the lane-zero scalar value. - SmallVector Ops; - for (unsigned I = 0, E = getNumOperands(); I != E; I++) - Ops.push_back(State.get(getOperand(I), VPLane(0))); - - auto *NewGEP = - State.Builder.CreateGEP(getSourceElementType(), Ops[0], drop_begin(Ops), - "", getGEPNoWrapFlags()); - Value *Splat = State.Builder.CreateVectorSplat(State.VF, NewGEP); - State.set(this, Splat); - } else { - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers unless VF is scalar. - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. - auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant()); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector Indices; - for (unsigned I = 1, E = getNumOperands(); I < E; I++) { - VPValue *Operand = getOperand(I); - Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1))); - } - - // Create the new GEP. Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. - auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices, - "", getGEPNoWrapFlags()); - assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - State.set(this, NewGEP); - } + assert( + any_of(operands(), + [](VPValue *Op) { return !Op->isDefinedOutsideLoopRegions(); }) && + "Expected at least one loop-variant operand"); + + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers unless VF is scalar. + // The pointer operand of the new GEP. If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = State.get(getOperand(0), isPointerLoopInvariant()); + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. + SmallVector Indices; + for (unsigned I = 1, E = getNumOperands(); I < E; I++) { + VPValue *Operand = getOperand(I); + Indices.push_back(State.get(Operand, isIndexLoopInvariant(I - 1))); + } + + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. 
+ auto *NewGEP = State.Builder.CreateGEP(getSourceElementType(), Ptr, Indices, + "", getGEPNoWrapFlags()); + assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + State.set(this, NewGEP); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index f5bef08fafcdc..10afd006c90c9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -40,10 +40,6 @@ using namespace llvm; using namespace VPlanPatternMatch; -static cl::opt EnableWideActiveLaneMask( - "enable-wide-lane-mask", cl::init(false), cl::Hidden, - cl::desc("Enable use of wide get active lane mask instructions")); - bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( VPlan &Plan, function_ref @@ -1391,7 +1387,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) { - if (!isa(&R)) + if (!isa(&R)) continue; auto *RepR = dyn_cast(&R); if (RepR && (RepR->isSingleScalar() || RepR->isPredicated())) @@ -4147,13 +4144,13 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) { /// is defined at \p Idx of a load interleave group. static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx) { - auto *DefR = OpV->getDefiningRecipe(); - if (!DefR) - return WideMember0->getOperand(OpIdx) == OpV; - if (auto *W = dyn_cast(DefR)) - return !W->getMask() && WideMember0->getOperand(OpIdx) == OpV; - - if (auto *IR = dyn_cast(DefR)) + VPValue *Member0Op = WideMember0->getOperand(OpIdx); + VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe(); + if (!Member0OpR) + return Member0Op == OpV; + if (auto *W = dyn_cast(Member0OpR)) + return !W->getMask() && Member0Op == OpV; + if (auto *IR = dyn_cast(Member0OpR)) return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV; return false; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index b28559b620e13..34850743e7b62 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -32,6 +32,7 @@ class VPRecipeBuilder; struct VFRange; extern cl::opt VerifyEachVPlan; +extern cl::opt EnableWideActiveLaneMask; struct VPlanTransforms { /// Helper to run a VPlan transform \p Transform on \p VPlan, forwarding extra diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll index ec848c2c08305..d9e26dc47b53f 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -enable-no-nans-fp-math -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16 -mattr=+sve | FileCheck %s +; RUN: opt < %s -enable-no-nans-fp-math -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16,+sve | FileCheck %s --check-prefixes=CHECK,CHECK-BASE +; RUN: opt < %s -enable-no-nans-fp-math -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16,+sve-b16b16,+sve | FileCheck %s --check-prefixes=CHECK,CHECK-BF16 target datalayout = 
"e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -31,6 +32,21 @@ define void @fadd() { ret void } +define void @fadd_bf16() { +; CHECK-LABEL: 'fadd_bf16' +; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fadd poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:27 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fadd poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:54 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fadd poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV4BF16 = fadd poison, poison + %NXV8BF16 = fadd poison, poison + %NXV16BF16 = fadd poison, poison + + ret void +} + + define void @fsub() { ; CHECK-LABEL: 'fsub' ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fsub poison, poison @@ -59,6 +75,20 @@ define void @fsub() { ret void } +define void @fsub_bf16() { +; CHECK-LABEL: 'fsub_bf16' +; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fsub poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:27 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fsub poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:54 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fsub poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV4BF16 = fsub poison, poison + %NXV8BF16 = fsub poison, poison + %NXV16BF16 = fsub poison, poison + + ret void +} + define void @fneg() { ; CHECK-LABEL: 'fneg' ; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F16 = fneg poison @@ -87,6 +117,22 @@ define void @fneg() { ret void } +define void @fneg_bf16() { +; CHECK-LABEL: 'fneg_bf16' +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV2BF16 = fneg poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fneg poison +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fneg poison +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fneg poison +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV2BF16 = fneg poison + %NXV4BF16 = fneg poison + %NXV8BF16 = fneg poison + %NXV16BF16 = fneg poison + + ret void +} + define void @fmul() { ; CHECK-LABEL: 'fmul' ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fmul poison, poison @@ -113,6 +159,20 @@ define void @fmul() { ret void } +define void @fmul_bf16() { +; CHECK-LABEL: 'fmul_bf16' +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fmul poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:29 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fmul poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:58 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fmul poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV4BF16 = fmul poison, poison + %NXV8BF16 = fmul poison, poison + %NXV16BF16 = fmul poison, poison + + ret void +} + define void @fdiv() { ; CHECK-LABEL: 'fdiv' ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4F16 = fdiv poison, poison @@ -139,6 +199,20 @@ define void @fdiv() { ret void } +define void @fdiv_bf16() { +; CHECK-LABEL: 'fdiv_bf16' +; CHECK-NEXT: Cost 
Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NXV4BF16 = fdiv poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:29 CodeSize:4 Lat:4 SizeLat:4 for: %NXV8BF16 = fdiv poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:58 CodeSize:4 Lat:4 SizeLat:4 for: %NXV16BF16 = fdiv poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV4BF16 = fdiv poison, poison + %NXV8BF16 = fdiv poison, poison + %NXV16BF16 = fdiv poison, poison + + ret void +} + define void @frem() { ; CHECK-LABEL: 'frem' ; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F16 = frem poison, poison @@ -165,6 +239,20 @@ define void @frem() { ret void } +define void @frem_bf16() { +; CHECK-LABEL: 'frem_bf16' +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NXV4BF16 = frem poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NXV8BF16 = frem poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NXV16BF16 = frem poison, poison +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV4BF16 = frem poison, poison + %NXV8BF16 = frem poison, poison + %NXV16BF16 = frem poison, poison + + ret void +} + define void @fma() { ; CHECK-LABEL: 'fma' ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call @llvm.fma.nxv4f16( poison, poison, poison) @@ -191,6 +279,26 @@ define void @fma() { ret void } +define void @fma_bf16() { +; CHECK-BASE-LABEL: 'fma_bf16' +; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %NXV4BF16 = call @llvm.fma.nxv4bf16( poison, poison, poison) +; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %NXV8BF16 = call @llvm.fma.nxv8bf16( poison, poison, poison) +; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %NXV16BF16 = call @llvm.fma.nxv16bf16( poison, poison, poison) +; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; +; CHECK-BF16-LABEL: 'fma_bf16' +; CHECK-BF16-NEXT: Cost Model: Found costs of 2 for: %NXV4BF16 = call @llvm.fma.nxv4bf16( poison, poison, poison) +; CHECK-BF16-NEXT: Cost Model: Found costs of 2 for: %NXV8BF16 = call @llvm.fma.nxv8bf16( poison, poison, poison) +; CHECK-BF16-NEXT: Cost Model: Found costs of 4 for: %NXV16BF16 = call @llvm.fma.nxv16bf16( poison, poison, poison) +; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV4BF16 = call @llvm.fma.nxv4bf16( poison, poison, poison) + %NXV8BF16 = call @llvm.fma.nxv8bf16( poison, poison, poison) + %NXV16BF16 = call @llvm.fma.nxv16bf16( poison, poison, poison) + + ret void +} + define void @fmuladd() { ; CHECK-LABEL: 'fmuladd' ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call @llvm.fmuladd.nxv4f16( poison, poison, poison) @@ -216,3 +324,23 @@ define void @fmuladd() { ret void } + +define void @fmuladd_bf16() { +; CHECK-BASE-LABEL: 'fmuladd_bf16' +; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %NXV4BF16 = call @llvm.fmuladd.nxv4bf16( poison, poison, poison) +; CHECK-BASE-NEXT: Cost Model: Found costs of 1 for: %NXV8BF16 = call @llvm.fmuladd.nxv8bf16( poison, poison, poison) +; CHECK-BASE-NEXT: Cost Model: Found costs of 4 for: %NXV16BF16 = call @llvm.fmuladd.nxv16bf16( poison, poison, poison) +; CHECK-BASE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 
Lat:1 SizeLat:1 for: ret void +; +; CHECK-BF16-LABEL: 'fmuladd_bf16' +; CHECK-BF16-NEXT: Cost Model: Found costs of 2 for: %NXV4BF16 = call @llvm.fmuladd.nxv4bf16( poison, poison, poison) +; CHECK-BF16-NEXT: Cost Model: Found costs of 2 for: %NXV8BF16 = call @llvm.fmuladd.nxv8bf16( poison, poison, poison) +; CHECK-BF16-NEXT: Cost Model: Found costs of 4 for: %NXV16BF16 = call @llvm.fmuladd.nxv16bf16( poison, poison, poison) +; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; + %NXV4BF16 = call @llvm.fmuladd.nxv4bf16( poison, poison, poison) + %NXV8BF16 = call @llvm.fmuladd.nxv8bf16( poison, poison, poison) + %NXV16BF16 = call @llvm.fmuladd.nxv16bf16( poison, poison, poison) + + ret void +} diff --git a/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll index a84d666c1be6b..d1bcad4724e48 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators.ll @@ -24,8 +24,8 @@ loop: %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i - %a = load <8 x i8>, <8 x i8>* %ptr1_i, align 1 - %b = load <8 x i8>, <8 x i8>* %ptr2_i, align 1 + %a = load <8 x i8>, ptr %ptr1_i, align 1 + %b = load <8 x i8>, ptr %ptr2_i, align 1 %vabd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) %vabd_ext = zext <8 x i8> %vabd to <8 x i16> %acc_next = add <8 x i16> %vabd_ext, %acc_phi @@ -65,8 +65,8 @@ loop: %acc_phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i - %a = load <4 x i16>, <4 x i16>* %ptr1_i, align 1 - %b = load <4 x i16>, <4 x i16>* %ptr2_i, align 1 + %a = load <4 x i16>, ptr %ptr1_i, align 1 + %b = load <4 x i16>, ptr %ptr2_i, align 1 %vabd = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b) %vmov = zext <4 x i16> %vabd to <4 x i32> %acc_next = add <4 x i32> %vmov, %acc_phi @@ -116,8 +116,8 @@ loop: %acc_phi_lo = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next_lo, %loop ] %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i - %a = load <16 x i8>, <16 x i8>* %ptr1_i, align 1 - %b = load <16 x i8>, <16 x i8>* %ptr2_i, align 1 + %a = load <16 x i8>, ptr %ptr1_i, align 1 + %b = load <16 x i8>, ptr %ptr2_i, align 1 %a_hi = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <8 x i32> %b_hi = shufflevector <16 x i8> %b, <16 x i8> zeroinitializer, <8 x i32> %a_lo = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <8 x i32> @@ -160,8 +160,8 @@ loop: %acc_phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i32, ptr %ptr1, i32 %i %ptr2_i = getelementptr i32, ptr %ptr2, i32 %i - %a = load <4 x i32>, <4 x i32>* %ptr1_i, align 1 - %b = load <4 x i32>, <4 x i32>* %ptr2_i, align 1 + %a = load <4 x i32>, ptr %ptr1_i, align 1 + %b = load <4 x i32>, ptr %ptr2_i, align 1 %vabd = tail call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %a, <4 x i32> %b) %acc_next = add <4 x i32> %acc_phi, %vabd %next_i = add i32 %i, 4 @@ -198,8 +198,8 @@ loop: ; Load values from ptr1 and ptr2 %ptr1_i = getelementptr i32, ptr %ptr1, i32 %i %ptr2_i = getelementptr i32, ptr %ptr2, i32 %i - %a = load <4 x i32>, <4 x i32>* %ptr1_i, align 1 - %b = load <4 x i32>, <4 x i32>* %ptr2_i, 
align 1 + %a = load <4 x i32>, ptr %ptr1_i, align 1 + %b = load <4 x i32>, ptr %ptr2_i, align 1 ; Perform the intrinsic operation %vabd = tail call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %a, <4 x i32> %b) %acc_next = add <4 x i32> %acc_phi, %vabd @@ -237,8 +237,8 @@ loop: %acc_phi = phi <2 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i32, ptr %ptr1, i32 %i %ptr2_i = getelementptr i32, ptr %ptr2, i32 %i - %a = load <2 x i32>, <2 x i32>* %ptr1_i, align 1 - %b = load <2 x i32>, <2 x i32>* %ptr2_i, align 1 + %a = load <2 x i32>, ptr %ptr1_i, align 1 + %b = load <2 x i32>, ptr %ptr2_i, align 1 %vabd = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b) %acc_next = add <2 x i32> %acc_phi, %vabd %next_i = add i32 %i, 2 @@ -272,8 +272,8 @@ loop: %acc_phi = phi <8 x i8> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i - %a = load <8 x i8>, <8 x i8>* %ptr1_i, align 1 - %b = load <8 x i8>, <8 x i8>* %ptr2_i, align 1 + %a = load <8 x i8>, ptr %ptr1_i, align 1 + %b = load <8 x i8>, ptr %ptr2_i, align 1 %vabd = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) %acc_next = add <8 x i8> %acc_phi, %vabd %next_i = add i32 %i, 8 @@ -307,8 +307,8 @@ loop: %acc_phi = phi <16 x i8> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i - %a = load <16 x i8>, <16 x i8>* %ptr1_i, align 1 - %b = load <16 x i8>, <16 x i8>* %ptr2_i, align 1 + %a = load <16 x i8>, ptr %ptr1_i, align 1 + %b = load <16 x i8>, ptr %ptr2_i, align 1 %vabd = tail call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %a, <16 x i8> %b) %acc_next = add <16 x i8> %acc_phi, %vabd %next_i = add i32 %i, 16 @@ -342,8 +342,8 @@ loop: %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i - %a = load <8 x i16>, <8 x i16>* %ptr1_i, align 1 - %b = load <8 x i16>, <8 x i16>* %ptr2_i, align 1 + %a = load <8 x i16>, ptr %ptr1_i, align 1 + %b = load <8 x i16>, ptr %ptr2_i, align 1 %vabd = tail call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %a, <8 x i16> %b) %acc_next = add <8 x i16> %acc_phi, %vabd %next_i = add i32 %i, 8 @@ -377,8 +377,8 @@ loop: %acc_phi = phi <8 x i8> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i - %a = load <8 x i8>, <8 x i8>* %ptr1_i, align 1 - %b = load <8 x i8>, <8 x i8>* %ptr2_i, align 1 + %a = load <8 x i8>, ptr %ptr1_i, align 1 + %b = load <8 x i8>, ptr %ptr2_i, align 1 %vabd = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) %acc_next = add <8 x i8> %acc_phi, %vabd %next_i = add i32 %i, 8 @@ -411,8 +411,8 @@ loop: %acc_phi = phi <4 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i - %a = load <4 x i16>, <4 x i16>* %ptr1_i, align 1 - %b = load <4 x i16>, <4 x i16>* %ptr2_i, align 1 + %a = load <4 x i16>, ptr %ptr1_i, align 1 + %b = load <4 x i16>, ptr %ptr2_i, align 1 %vabd = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b) %acc_next = add <4 x i16> %acc_phi, %vabd %next_i = add i32 %i, 4 @@ -445,8 +445,8 @@ loop: %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] 
%ptr1_i = getelementptr i16, ptr %ptr1, i32 %i %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i - %a = load <8 x i16>, <8 x i16>* %ptr1_i, align 1 - %b = load <8 x i16>, <8 x i16>* %ptr2_i, align 1 + %a = load <8 x i16>, ptr %ptr1_i, align 1 + %b = load <8 x i16>, ptr %ptr2_i, align 1 %vabd = tail call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %a, <8 x i16> %b) %acc_next = add <8 x i16> %acc_phi, %vabd %next_i = add i32 %i, 8 @@ -480,8 +480,8 @@ loop: %acc_phi = phi <8 x i16> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i8, ptr %ptr1, i32 %i %ptr2_i = getelementptr i8, ptr %ptr2, i32 %i - %a = load <8 x i8>, <8 x i8>* %ptr1_i, align 1 - %b = load <8 x i8>, <8 x i8>* %ptr2_i, align 1 + %a = load <8 x i8>, ptr %ptr1_i, align 1 + %b = load <8 x i8>, ptr %ptr2_i, align 1 %vabd = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) %vmov = zext <8 x i8> %vabd to <8 x i16> %acc_next = add <8 x i16> %vmov, %acc_phi @@ -516,8 +516,8 @@ loop: %acc_phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %acc_next, %loop ] %ptr1_i = getelementptr i16, ptr %ptr1, i32 %i %ptr2_i = getelementptr i16, ptr %ptr2, i32 %i - %a = load <4 x i16>, <4 x i16>* %ptr1_i, align 1 - %b = load <4 x i16>, <4 x i16>* %ptr2_i, align 1 + %a = load <4 x i16>, ptr %ptr1_i, align 1 + %b = load <4 x i16>, ptr %ptr2_i, align 1 %vabd = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b) %vmov = zext <4 x i16> %vabd to <4 x i32> %acc_next = add <4 x i32> %vmov, %acc_phi diff --git a/llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir b/llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir new file mode 100644 index 0000000000000..8c31e7c2d1cec --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-regress-opt-cmp-signed.mir @@ -0,0 +1,55 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=aarch64-linux-gnu -run-pass peephole-opt -o - %s | FileCheck %s +--- | + define i32 @test01() nounwind { + entry: + %0 = select i1 true, i32 1, i32 0 + %1 = and i32 %0, 65535 + %2 = icmp sgt i32 %1, 0 + br i1 %2, label %if.then, label %if.end + + if.then: ; preds = %entry + ret i32 1 + + if.end: ; preds = %entry + ret i32 0 + } +... +--- +name: test01 +registers: + - { id: 0, class: gpr32 } + - { id: 1, class: gpr32common } +body: | + ; CHECK-LABEL: name: test01 + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 1 + ; CHECK-NEXT: [[ANDSWri:%[0-9]+]]:gpr32common = ANDSWri killed [[ANDSWri]], 15, implicit-def $nzcv + ; CHECK-NEXT: Bcc 12, %bb.2, implicit $nzcv + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.if.then: + ; CHECK-NEXT: $w0 = MOVi32imm 1 + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.if.end: + ; CHECK-NEXT: $w0 = MOVi32imm 0 + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + bb.0.entry: + successors: %bb.2.if.end, %bb.1.if.then + + %0 = MOVi32imm 1 + %1 = ANDWri killed %1, 15 + $wzr = SUBSWri killed %1, 0, 0, implicit-def $nzcv + Bcc 12, %bb.2.if.end, implicit $nzcv + + bb.1.if.then: + $w0 = MOVi32imm 1 + RET_ReallyLR implicit $w0 + + bb.2.if.end: + $w0 = MOVi32imm 0 + RET_ReallyLR implicit $w0 + +... 
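Editorial aside on the new arm64-regress-opt-cmp-signed.mir test above: it depends on a property of AArch64 flag-setting logical instructions. ANDS (and its TST alias) sets N and Z from the result and clears C and V, so any compare-against-zero predicate, including the signed one the test exercises through Bcc 12 (gt), can be answered from the ANDS result alone; that is what lets peephole-opt fold the ANDWri + SUBSWri pair into a single ANDSWri. The C++ sketch below is only an illustrative model of that NZCV behaviour; the enum and function names are invented for the example and are not LLVM APIs.

#include <cassert>
#include <cstdint>

// Minimal model: after an AArch64 ANDS, N = (result < 0), Z = (result == 0),
// and C = V = 0. Evaluating a condition code against these flags is therefore
// equivalent to testing the result directly, so the separate SUBSWri compare
// against zero is redundant.
enum class CondCode { EQ, NE, GT, GE, LT, LE };

static bool holdsAfterAnds(int32_t Result, CondCode CC) {
  const bool N = Result < 0;
  const bool Z = Result == 0;
  const bool V = false; // cleared by ANDS
  switch (CC) {
  case CondCode::EQ: return Z;
  case CondCode::NE: return !Z;
  case CondCode::GT: return !Z && N == V; // signed >
  case CondCode::GE: return N == V;       // signed >=
  case CondCode::LT: return N != V;       // signed <
  case CondCode::LE: return Z || N != V;  // signed <=
  }
  return false;
}

int main() {
  // Mirrors the test: (1 & 15) = 1, so the gt branch (Bcc 12) is taken.
  assert(holdsAfterAnds(1 & 15, CondCode::GT));
  assert(!holdsAfterAnds(0, CondCode::GT));
  assert(holdsAfterAnds(-4, CondCode::LT));
  return 0;
}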
diff --git a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll index dbbfbea9176f6..f725c19081deb 100644 --- a/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll +++ b/llvm/test/CodeGen/AArch64/build-vector-two-dup.ll @@ -188,11 +188,11 @@ entry: define <8 x i8> @test11(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LABEL: test11: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ld1r { v1.8b }, [x0] -; CHECK-NEXT: ld1r { v2.8b }, [x1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.h[2], v2.h[0] -; CHECK-NEXT: mov v0.h[3], v1.h[0] +; CHECK-NEXT: ld1r { v0.8b }, [x0] +; CHECK-NEXT: ld1r { v1.8b }, [x1] +; CHECK-NEXT: fmov d2, d0 +; CHECK-NEXT: mov v0.h[2], v1.h[0] +; CHECK-NEXT: mov v0.h[3], v2.h[0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/cgdata-merge-local.ll b/llvm/test/CodeGen/AArch64/cgdata-merge-local.ll index 608fe29e17398..d421b3f17caf8 100644 --- a/llvm/test/CodeGen/AArch64/cgdata-merge-local.ll +++ b/llvm/test/CodeGen/AArch64/cgdata-merge-local.ll @@ -54,9 +54,9 @@ define i32 @f1(i32 %a) { entry: %idxprom = sext i32 %a to i64 - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom - %0 = load i32, i32* %arrayidx, align 4 - %1 = load volatile i32, i32* @g1, align 4 + %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %1 = load volatile i32, ptr @g1, align 4 %mul = mul nsw i32 %1, %0 %add = add nsw i32 %mul, 1 ret i32 %add @@ -65,9 +65,9 @@ entry: define i32 @f2(i32 %a) { entry: %idxprom = sext i32 %a to i64 - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom - %0 = load i32, i32* %arrayidx, align 4 - %1 = load volatile i32, i32* @g2, align 4 + %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %1 = load volatile i32, ptr @g2, align 4 %mul = mul nsw i32 %1, %0 %add = add nsw i32 %mul, 1 ret i32 %add diff --git a/llvm/test/CodeGen/AArch64/cgdata-merge-no-params.ll b/llvm/test/CodeGen/AArch64/cgdata-merge-no-params.ll index 10f0e10f11d66..a9da1253de01d 100644 --- a/llvm/test/CodeGen/AArch64/cgdata-merge-no-params.ll +++ b/llvm/test/CodeGen/AArch64/cgdata-merge-no-params.ll @@ -19,9 +19,9 @@ define i32 @f1(i32 %a) { entry: %idxprom = sext i32 %a to i64 - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom - %0 = load i32, i32* %arrayidx, align 4 - %1 = load volatile i32, i32* @g1, align 4 + %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %1 = load volatile i32, ptr @g1, align 4 %mul = mul nsw i32 %1, %0 %add = add nsw i32 %mul, 1 ret i32 %add @@ -30,9 +30,9 @@ entry: define i32 @f2(i32 %a) { entry: %idxprom = sext i32 %a to i64 - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom - %0 = load i32, i32* %arrayidx, align 4 - %1 = load volatile i32, i32* @g1, align 4 + %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %1 = load volatile i32, ptr @g1, align 4 %mul = mul nsw i32 %1, %0 %add = add nsw i32 %mul, 1 ret i32 %add diff --git a/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll b/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll index 9986af7eb231c..7ab2aba8d75e2 100644 --- a/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll +++ 
b/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll @@ -12,9 +12,9 @@ define i32 @0(i32 %a) { entry: %idxprom = sext i32 %a to i64 - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom - %0 = load i32, i32* %arrayidx, align 4 - %1 = load volatile i32, i32* @g1, align 4 + %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %1 = load volatile i32, ptr @g1, align 4 %mul = mul nsw i32 %1, %0 %add = add nsw i32 %mul, 1 ret i32 %add @@ -23,9 +23,9 @@ entry: define i32 @1(i32 %a) { entry: %idxprom = sext i32 %a to i64 - %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom - %0 = load i32, i32* %arrayidx, align 4 - %1 = load volatile i32, i32* @g2, align 4 + %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom + %0 = load i32, ptr %arrayidx, align 4 + %1 = load volatile i32, ptr @g2, align 4 %mul = mul nsw i32 %1, %0 %add = add nsw i32 %mul, 1 ret i32 %add diff --git a/llvm/test/CodeGen/AArch64/divrem.ll b/llvm/test/CodeGen/AArch64/divrem.ll index 5cd7e098d00bb..e3cbd17dc4c3f 100644 --- a/llvm/test/CodeGen/AArch64/divrem.ll +++ b/llvm/test/CodeGen/AArch64/divrem.ll @@ -2,7 +2,7 @@ ; SDIVREM/UDIVREM DAG nodes are generated but expanded when lowering and ; should not generate select error. -define <2 x i32> @test_udivrem(<2 x i32> %x, < 2 x i32> %y, < 2 x i32>* %z) { +define <2 x i32> @test_udivrem(<2 x i32> %x, < 2 x i32> %y, ptr %z) { ; CHECK-LABEL: test_udivrem ; CHECK-DAG: udivrem ; CHECK-NOT: LLVM ERROR: Cannot select @@ -12,10 +12,10 @@ define <2 x i32> @test_udivrem(<2 x i32> %x, < 2 x i32> %y, < 2 x i32>* %z) { ret <2 x i32> %1 } -define <4 x i32> @test_sdivrem(<4 x i32> %x, ptr %y) { +define <4 x i32> @test_sdivrem(<4 x i32> %x, ptr %y) { ; CHECK-LABEL: test_sdivrem ; CHECK-DAG: sdivrem - %div = sdiv <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 > + %div = sdiv <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 > store <4 x i32> %div, ptr %y %1 = srem <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 > ret <4 x i32> %1 diff --git a/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll b/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll index 33c5ba7987974..8297fa2d4e3f9 100644 --- a/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll +++ b/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll @@ -161,6 +161,338 @@ define i1 @lt64_u16_and_23(i64 %0) { ret i1 %3 } +define i1 @test_disjoint(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: tst w9, w8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp eq i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: tst w9, w8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp sgt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint3: +; CHECK: // %bb.0: // 
%entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: tst w9, w8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp slt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint4(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: and w8, w9, w8 +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %5 = and i32 %3, %4 + %6 = icmp sle i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse_4(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint_inverse_4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bic w8, w9, w8 +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 %3, %not + %6 = icmp sle i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint_inverse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bics wzr, w9, w8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 %3, %not + %6 = icmp eq i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2_inverse(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint2_inverse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bics wzr, w9, w8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 %3, %not + %6 = icmp sgt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3_inverse(i1 %0, i32 %1, i32 %2) { +; CHECK-LABEL: test_disjoint3_inverse: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr w9, w2, #0x800000 +; CHECK-NEXT: lsl w8, w8, w1 +; CHECK-NEXT: bics wzr, w9, w8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i32 %2, 8388608 + %4 = shl nuw i32 1, %1 + %not = xor i32 %4, -1 + %5 = and i32 %3, %not + %6 = icmp slt i32 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: tst x9, x8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, 
w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp eq i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint2_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: tst x9, x8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp sgt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint3_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: tst x9, x8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp slt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint4_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint4_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: and x8, x9, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %5 = and i64 %3, %4 + %6 = icmp sle i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse_4_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint_inverse_4_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bic x8, x9, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp sle i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint_inverse_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint_inverse_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bics xzr, x9, x8 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp eq i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint2_inverse_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint2_inverse_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bics xzr, x9, x8 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 
+ %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp sgt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + +define i1 @test_disjoint3_inverse_64(i1 %0, i64 %1, i64 %2) { +; CHECK-LABEL: test_disjoint3_inverse_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: orr x9, x2, #0x80000000000000 +; CHECK-NEXT: lsl x8, x8, x1 +; CHECK-NEXT: bics xzr, x9, x8 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: orr w8, w0, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret +entry: + %3 = or disjoint i64 %2, 36028797018963968 + %4 = shl nuw i64 1, %1 + %not = xor i64 %4, -1 + %5 = and i64 %3, %not + %6 = icmp slt i64 %5, 0 + %7 = select i1 %0, i1 true, i1 %6 + ret i1 %7 +} + ; negative test define i1 @lt3_u8(i8 %0) { ; CHECK-LABEL: lt3_u8: diff --git a/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll b/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll index 91cf605613b9e..c0c8894ce1f6b 100644 --- a/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll +++ b/llvm/test/CodeGen/AArch64/ldp-stp-scaled-unscaled-pairs.ll @@ -85,7 +85,7 @@ define i64 @test_ldrsw_ldursw(ptr %p) #0 { ; CHECK-NEXT: add.2d v0, v[[V0]], v[[V1]] ; CHECK-NEXT: ret define <2 x i64> @test_ldrq_ldruq_invalidoffset(ptr %p) #0 { - %tmp1 = load <2 x i64>, < 2 x i64>* %p, align 8 + %tmp1 = load <2 x i64>, ptr %p, align 8 %add.ptr2 = getelementptr inbounds i64, ptr %p, i64 3 %tmp2 = load <2 x i64>, ptr %add.ptr2, align 8 %add = add nsw <2 x i64> %tmp1, %tmp2 diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-subregs.mir b/llvm/test/CodeGen/AArch64/machine-combiner-subregs.mir new file mode 100644 index 0000000000000..c96a0385c3a4e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-combiner-subregs.mir @@ -0,0 +1,35 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=aarch64-gnu-linux -mcpu=neoverse-n2 -run-pass=machine-combiner -o - %s | FileCheck %s + +# Make sure machine combiner doesn't drop subregister indexes. + +--- +name: reassociate_adds2_reassoc +tracksRegLiveness: true +body: | + bb.0: + liveins: $q0, $q1, $q2, $q3 + + ; CHECK-LABEL: name: reassociate_adds2_reassoc + ; CHECK: liveins: $q0, $q1, $q2, $q3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr128 = COPY $q2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fpr128 = COPY $q3 + ; CHECK-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nsz reassoc nofpexcept FADDSrr [[COPY]].ssub, [[COPY1]].ssub, implicit $fpcr + ; CHECK-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = nsz reassoc nofpexcept FADDSrr [[COPY2]].ssub, [[COPY3]].ssub, implicit $fpcr + ; CHECK-NEXT: [[FADDSrr2:%[0-9]+]]:fpr32 = nsz reassoc nofpexcept FADDSrr killed [[FADDSrr1]], killed [[FADDSrr]], implicit $fpcr + ; CHECK-NEXT: $s0 = COPY [[FADDSrr2]] + ; CHECK-NEXT: RET_ReallyLR implicit $s0 + %0:fpr128 = COPY $q0 + %1:fpr128 = COPY $q1 + %2:fpr128 = COPY $q2 + %3:fpr128 = COPY $q3 + %4:fpr32 = nsz reassoc nofpexcept FADDSrr %0.ssub, %1.ssub, implicit $fpcr + %5:fpr32 = nsz reassoc nofpexcept FADDSrr %2.ssub, killed %4, implicit $fpcr + %6:fpr32 = nsz reassoc nofpexcept FADDSrr killed %5, %3.ssub, implicit $fpcr + $s0 = COPY %6 + RET_ReallyLR implicit $s0 + +... 
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll b/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll index 3230c9e946da7..b3a7ec961b736 100644 --- a/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll +++ b/llvm/test/CodeGen/AArch64/machine-licm-sink-instr.ll @@ -20,20 +20,17 @@ define i32 @sink_load_and_copy(i32 %n) { ; CHECK-NEXT: b.lt .LBB0_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: adrp x8, A -; CHECK-NEXT: mov w20, w19 -; CHECK-NEXT: ldr w21, [x8, :lo12:A] +; CHECK-NEXT: mov w21, w19 +; CHECK-NEXT: ldr w20, [x8, :lo12:A] ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov w0, w21 +; CHECK-NEXT: mov w0, w20 ; CHECK-NEXT: bl _Z3usei -; CHECK-NEXT: sdiv w20, w20, w0 -; CHECK-NEXT: subs w19, w19, #1 +; CHECK-NEXT: sdiv w19, w19, w0 +; CHECK-NEXT: subs w21, w21, #1 ; CHECK-NEXT: b.ne .LBB0_2 -; CHECK-NEXT: b .LBB0_4 -; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: mov w20, w19 -; CHECK-NEXT: .LBB0_4: // %for.cond.cleanup -; CHECK-NEXT: mov w0, w20 +; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup +; CHECK-NEXT: mov w0, w19 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -82,15 +79,12 @@ define i32 @cant_sink_successive_call(i32 %n) { ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov w0, w20 ; CHECK-NEXT: bl _Z3usei -; CHECK-NEXT: sdiv w21, w21, w0 -; CHECK-NEXT: subs w19, w19, #1 +; CHECK-NEXT: sdiv w19, w19, w0 +; CHECK-NEXT: subs w21, w21, #1 ; CHECK-NEXT: b.ne .LBB1_2 -; CHECK-NEXT: b .LBB1_4 -; CHECK-NEXT: .LBB1_3: -; CHECK-NEXT: mov w21, w19 -; CHECK-NEXT: .LBB1_4: // %for.cond.cleanup +; CHECK-NEXT: .LBB1_3: // %for.cond.cleanup +; CHECK-NEXT: mov w0, w19 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov w0, w21 ; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -139,15 +133,12 @@ define i32 @cant_sink_successive_store(ptr nocapture readnone %store, i32 %n) { ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov w0, w20 ; CHECK-NEXT: bl _Z3usei -; CHECK-NEXT: sdiv w21, w21, w0 -; CHECK-NEXT: subs w19, w19, #1 +; CHECK-NEXT: sdiv w19, w19, w0 +; CHECK-NEXT: subs w21, w21, #1 ; CHECK-NEXT: b.ne .LBB2_2 -; CHECK-NEXT: b .LBB2_4 -; CHECK-NEXT: .LBB2_3: -; CHECK-NEXT: mov w21, w19 -; CHECK-NEXT: .LBB2_4: // %for.cond.cleanup +; CHECK-NEXT: .LBB2_3: // %for.cond.cleanup +; CHECK-NEXT: mov w0, w19 ; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov w0, w21 ; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-iterative.mir b/llvm/test/CodeGen/AArch64/machine-outliner-iterative.mir index b7fbdc09c1dd1..a635231fef7fb 100644 --- a/llvm/test/CodeGen/AArch64/machine-outliner-iterative.mir +++ b/llvm/test/CodeGen/AArch64/machine-outliner-iterative.mir @@ -6,9 +6,9 @@ # #; define void @"$s12"(...) { define i64 @"$s5” (...) { define void @"$s13"(...) { # ... ... ... -# %8 = load i1, i1* %7 %8 = load i1, i1* %7 -# %9 = load i4, i4*, %6 %9 = load i4, i4*, %6 %9 = load i4, i4*, %6 -# store i4 %9, i4* %5 store i4 %9, i4* %5 store i4 %9, i4* %5 +# %8 = load i1, ptr %7 %8 = load i1, ptr %7 +# %9 = load i4, ptr, %6 %9 = load i4, ptr, %6 %9 = load i4, ptr, %6 +# store i4 %9, ptr %5 store i4 %9, ptr %5 store i4 %9, ptr %5 # ... ... ... # } } } # @@ -16,7 +16,7 @@ # # define void @"$s12"(...) 
{ define i64 @"$s5” (...) { define void @"$s13"(...) { # ... ... ... -# %8 = load i1, i1* %7 %8 = load i1, i1* %7 +# %8 = load i1, ptr %7 %8 = load i1, ptr %7 # call void @outlined_function_1_1 call void @outlined_function_1_1 call void @outlined_function_1_1 # ... ... ... # } } } diff --git a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll index e7e109170d6a1..338084295fc7f 100644 --- a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll +++ b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll @@ -16,13 +16,12 @@ define i32 @test(ptr %ptr) { ; CHECK-NEXT: mov w9, wzr ; CHECK-NEXT: LBB0_1: ; %.thread ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lsr w11, w9, #1 ; CHECK-NEXT: sub w10, w9, #1 -; CHECK-NEXT: mov w9, w11 +; CHECK-NEXT: lsr w9, w9, #1 ; CHECK-NEXT: tbnz w10, #0, LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb343 ; CHECK-NEXT: and w9, w10, #0x1 -; CHECK-NEXT: mov w0, #-1 +; CHECK-NEXT: mov w0, #-1 ; =0xffffffff ; CHECK-NEXT: str w9, [x8] ; CHECK-NEXT: ret bb: diff --git a/llvm/test/CodeGen/AArch64/misched-fusion-cmp-bcc.ll b/llvm/test/CodeGen/AArch64/misched-fusion-cmp-bcc.ll index 700a060ef968f..0a10e80d998cd 100644 --- a/llvm/test/CodeGen/AArch64/misched-fusion-cmp-bcc.ll +++ b/llvm/test/CodeGen/AArch64/misched-fusion-cmp-bcc.ll @@ -15,10 +15,10 @@ ; RUN: llc %s -o - -O0 -mtriple=aarch64-unknown -mcpu=ampere1b | FileCheck %s -define void @test_cmp_bcc_fusion(i32 %x, i32 %y, i32* %arr) { +define void @test_cmp_bcc_fusion(i32 %x, i32 %y, ptr %arr) { entry: %cmp = icmp eq i32 %x, %y - store i32 %x, i32* %arr, align 4 + store i32 %x, ptr %arr, align 4 br i1 %cmp, label %if_true, label %if_false if_true: diff --git a/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll b/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll index b7dde881291bb..1a85f803b9e57 100644 --- a/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll +++ b/llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll @@ -19,7 +19,7 @@ define void @test_nopair_st(ptr %ptr, <2 x double> %v1, <2 x double> %v2) { ; SLOW-NOT: ldp ; FAST: ldp define <2 x i64> @test_nopair_ld(ptr %p) { - %tmp1 = load <2 x i64>, < 2 x i64>* %p, align 8 + %tmp1 = load <2 x i64>, ptr %p, align 8 %add.ptr2 = getelementptr inbounds i64, ptr %p, i64 2 %tmp2 = load <2 x i64>, ptr %add.ptr2, align 8 %add = add nsw <2 x i64> %tmp1, %tmp2 diff --git a/llvm/test/CodeGen/AArch64/ptrauth-bti-call.ll b/llvm/test/CodeGen/AArch64/ptrauth-bti-call.ll index 0356a46ec1050..df5e1a9f1ee10 100644 --- a/llvm/test/CodeGen/AArch64/ptrauth-bti-call.ll +++ b/llvm/test/CodeGen/AArch64/ptrauth-bti-call.ll @@ -17,7 +17,7 @@ ; CHECK-NEXT: bti c ; CHECK-NEXT: mov x16, x0 ; CHECK-NEXT: braaz x16 -define i32 @test_tailcall_ia_0(i32 ()* %arg0) #0 { +define i32 @test_tailcall_ia_0(ptr %arg0) #0 { %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 0, i64 0) ] ret i32 %tmp0 } @@ -26,7 +26,7 @@ define i32 @test_tailcall_ia_0(i32 ()* %arg0) #0 { ; CHECK-NEXT: bti c ; CHECK-NEXT: mov x16, x0 ; CHECK-NEXT: brabz x16 -define i32 @test_tailcall_ib_0(i32 ()* %arg0) #0 { +define i32 @test_tailcall_ib_0(ptr %arg0) #0 { %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 1, i64 0) ] ret i32 %tmp0 } @@ -36,7 +36,7 @@ define i32 @test_tailcall_ib_0(i32 ()* %arg0) #0 { ; CHECK-NEXT: mov x16, x0 ; CHECK-NEXT: mov x17, #42 ; CHECK-NEXT: braa x16, x17 -define i32 @test_tailcall_ia_imm(i32 ()* %arg0) #0 { +define i32 @test_tailcall_ia_imm(ptr %arg0) #0 { %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 0, i64 42) ] ret i32 %tmp0 } @@ -46,7 +46,7 @@ define i32 
@test_tailcall_ia_imm(i32 ()* %arg0) #0 { ; CHECK-NEXT: mov x16, x0 ; CHECK-NEXT: mov x17, #42 ; CHECK-NEXT: brab x16, x17 -define i32 @test_tailcall_ib_imm(i32 ()* %arg0) #0 { +define i32 @test_tailcall_ib_imm(ptr %arg0) #0 { %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 1, i64 42) ] ret i32 %tmp0 } @@ -60,8 +60,8 @@ define i32 @test_tailcall_ib_imm(i32 ()* %arg0) #0 { ; ELF-NEXT: ldr x1, [x1] ; ELF-NEXT: mov x16, x0 ; ELF-NEXT: braa x16, x1 -define i32 @test_tailcall_ia_var(i32 ()* %arg0, i64* %arg1) #0 { - %tmp0 = load i64, i64* %arg1 +define i32 @test_tailcall_ia_var(ptr %arg0, ptr %arg1) #0 { + %tmp0 = load i64, ptr %arg1 %tmp1 = tail call i32 %arg0() [ "ptrauth"(i32 0, i64 %tmp0) ] ret i32 %tmp1 } @@ -75,8 +75,8 @@ define i32 @test_tailcall_ia_var(i32 ()* %arg0, i64* %arg1) #0 { ; ELF-NEXT: ldr x1, [x1] ; ELF-NEXT: mov x16, x0 ; ELF-NEXT: brab x16, x1 -define i32 @test_tailcall_ib_var(i32 ()* %arg0, i64* %arg1) #0 { - %tmp0 = load i64, i64* %arg1 +define i32 @test_tailcall_ib_var(ptr %arg0, ptr %arg1) #0 { + %tmp0 = load i64, ptr %arg1 %tmp1 = tail call i32 %arg0() [ "ptrauth"(i32 1, i64 %tmp0) ] ret i32 %tmp1 } @@ -85,7 +85,7 @@ define i32 @test_tailcall_ib_var(i32 ()* %arg0, i64* %arg1) #0 { ; CHECK-NEXT: bti c ; CHECK-NEXT: mov x16, x0 ; CHECK-NEXT: braa x16, x1 -define i32 @test_tailcall_ia_arg(i32 ()* %arg0, i64 %arg1) #0 { +define i32 @test_tailcall_ia_arg(ptr %arg0, i64 %arg1) #0 { %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 0, i64 %arg1) ] ret i32 %tmp0 } @@ -94,7 +94,7 @@ define i32 @test_tailcall_ia_arg(i32 ()* %arg0, i64 %arg1) #0 { ; CHECK-NEXT: bti c ; CHECK-NEXT: mov x16, x0 ; CHECK-NEXT: brab x16, x1 -define i32 @test_tailcall_ib_arg(i32 ()* %arg0, i64 %arg1) #0 { +define i32 @test_tailcall_ib_arg(ptr %arg0, i64 %arg1) #0 { %tmp0 = tail call i32 %arg0() [ "ptrauth"(i32 1, i64 %arg1) ] ret i32 %tmp0 } @@ -103,8 +103,8 @@ define i32 @test_tailcall_ib_arg(i32 ()* %arg0, i64 %arg1) #0 { ; CHECK-NEXT: bti c ; CHECK-NEXT: ldr x16, [x0] ; CHECK-NEXT: braa x16, x1 -define i32 @test_tailcall_ia_arg_ind(i32 ()** %arg0, i64 %arg1) #0 { - %tmp0 = load i32 ()*, i32 ()** %arg0 +define i32 @test_tailcall_ia_arg_ind(ptr %arg0, i64 %arg1) #0 { + %tmp0 = load ptr, ptr %arg0 %tmp1 = tail call i32 %tmp0() [ "ptrauth"(i32 0, i64 %arg1) ] ret i32 %tmp1 } @@ -113,8 +113,8 @@ define i32 @test_tailcall_ia_arg_ind(i32 ()** %arg0, i64 %arg1) #0 { ; CHECK-NEXT: bti c ; CHECK-NEXT: ldr x16, [x0] ; CHECK-NEXT: brab x16, x1 -define i32 @test_tailcall_ib_arg_ind(i32 ()** %arg0, i64 %arg1) #0 { - %tmp0 = load i32 ()*, i32 ()** %arg0 +define i32 @test_tailcall_ib_arg_ind(ptr %arg0, i64 %arg1) #0 { + %tmp0 = load ptr, ptr %arg0 %tmp1 = tail call i32 %tmp0() [ "ptrauth"(i32 1, i64 %arg1) ] ret i32 %tmp1 } diff --git a/llvm/test/CodeGen/AArch64/ptrauth-call-rv-marker.ll b/llvm/test/CodeGen/AArch64/ptrauth-call-rv-marker.ll index 9cf77b125e107..950db5fd6381f 100644 --- a/llvm/test/CodeGen/AArch64/ptrauth-call-rv-marker.ll +++ b/llvm/test/CodeGen/AArch64/ptrauth-call-rv-marker.ll @@ -4,18 +4,18 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "arm64e-apple-iphoneos" -declare i8* @foo0(i32) -declare i8* @foo1() +declare ptr @foo0(i32) +declare ptr @foo1() -declare void @llvm.objc.release(i8*) -declare i8* @llvm.objc.retainAutoreleasedReturnValue(i8*) -declare i8* @llvm.objc.unsafeClaimAutoreleasedReturnValue(i8*) +declare void @llvm.objc.release(ptr) +declare ptr @llvm.objc.retainAutoreleasedReturnValue(ptr) +declare ptr 
@llvm.objc.unsafeClaimAutoreleasedReturnValue(ptr) -declare void @foo2(i8*) +declare void @foo2(ptr) declare void @foo(i64, i64, i64) -define void @rv_marker_ptrauth_blraa(i8* ()** %arg0, i64 %arg1) { +define void @rv_marker_ptrauth_blraa(ptr %arg0, i64 %arg1) { ; CHECK-LABEL: rv_marker_ptrauth_blraa ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: blraa [[ADDR]], x1 @@ -23,14 +23,14 @@ define void @rv_marker_ptrauth_blraa(i8* ()** %arg0, i64 %arg1) { ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; entry: - %tmp0 = load i8* ()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blraa_unsafeClaim(i8* ()** %arg0, i64 %arg1) { +define void @rv_marker_ptrauth_blraa_unsafeClaim(ptr %arg0, i64 %arg1) { ; CHECK-LABEL: rv_marker_ptrauth_blraa_unsafeClaim ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: blraa [[ADDR]], x1 @@ -38,14 +38,14 @@ define void @rv_marker_ptrauth_blraa_unsafeClaim(i8* ()** %arg0, i64 %arg1) { ; CHECK-NEXT: bl objc_unsafeClaimAutoreleasedReturnValue ; entry: - %tmp0 = load i8* ()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.unsafeClaimAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(ptr @llvm.objc.unsafeClaimAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blraa_disc_imm16(i8* ()** %arg0) { +define void @rv_marker_ptrauth_blraa_disc_imm16(ptr %arg0) { ; CHECK-LABEL: rv_marker_ptrauth_blraa_disc_imm16 ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: mov x17, #45431 @@ -53,14 +53,14 @@ define void @rv_marker_ptrauth_blraa_disc_imm16(i8* ()** %arg0) { ; CHECK-NEXT: mov x29, x29 ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; - %tmp0 = load i8* ()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 1, i64 45431), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 1, i64 45431), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blraa_multiarg(i8* (i64, i64, i64)** %arg0, i64 %arg1, i64 %a, i64 %b, i64 %c) { +define void @rv_marker_ptrauth_blraa_multiarg(ptr %arg0, i64 %arg1, i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: rv_marker_ptrauth_blraa_multiarg ; CHECK: mov [[TMP:x[0-9]+]], x1 ; CHECK-DAG: ldr [[ADDR:x[0-9]+]] @@ -71,28 +71,28 @@ define void @rv_marker_ptrauth_blraa_multiarg(i8* (i64, i64, i64)** %arg0, i64 % ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; entry: - %tmp0 = load i8* (i64, i64, i64)*, i8* (i64, i64, i64)** %arg0 - %call0 = call i8* %tmp0(i64 %c, i64 %b, i64 %a) [ "ptrauth"(i32 0, i64 %arg1), 
"clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0(i64 %c, i64 %b, i64 %a) [ "ptrauth"(i32 0, i64 %arg1), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blrab(i8* ()** %arg0, i64 %arg1) { +define void @rv_marker_ptrauth_blrab(ptr %arg0, i64 %arg1) { ; CHECK-LABEL: rv_marker_ptrauth_blrab ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: blrab [[ADDR]], x1 ; CHECK-NEXT: mov x29, x29 ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; - %tmp0 = load i8* ()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 1, i64 %arg1), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 1, i64 %arg1), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blrab_disc_imm16(i8* ()** %arg0) { +define void @rv_marker_ptrauth_blrab_disc_imm16(ptr %arg0) { ; CHECK-LABEL: rv_marker_ptrauth_blrab_disc_imm16 ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: mov x17, #256 @@ -100,42 +100,42 @@ define void @rv_marker_ptrauth_blrab_disc_imm16(i8* ()** %arg0) { ; CHECK-NEXT: mov x29, x29 ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; - %tmp0 = load i8* ()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 1, i64 256), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 1, i64 256), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blraaz(i8* ()** %arg0) { +define void @rv_marker_ptrauth_blraaz(ptr %arg0) { ; CHECK-LABEL: rv_marker_ptrauth_blraaz ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: blraaz [[ADDR]] ; CHECK-NEXT: mov x29, x29 ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; - %tmp0 = load i8* ()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 0, i64 0), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 0, i64 0), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blrabz(i8* ()** %arg0) { +define void @rv_marker_ptrauth_blrabz(ptr %arg0) { ; CHECK-LABEL: rv_marker_ptrauth_blrabz ; CHECK: ldr [[ADDR:x[0-9]+]], [ ; CHECK-NEXT: blrabz [[ADDR]] ; CHECK-NEXT: mov x29, x29 ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; - %tmp0 = load i8* ()*, i8* ()** %arg0 - %call0 = call i8* %tmp0() [ "ptrauth"(i32 1, i64 0), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 
= load ptr, ptr %arg0 + %call0 = call ptr %tmp0() [ "ptrauth"(i32 1, i64 0), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } -define void @rv_marker_ptrauth_blrabz_multiarg(i8* (i64, i64, i64)** %arg0, i64 %a, i64 %b, i64 %c) { +define void @rv_marker_ptrauth_blrabz_multiarg(ptr %arg0, i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: rv_marker_ptrauth_blrabz_multiarg ; CHECK: mov [[TMP:x[0-9]+]], x1 ; CHECK-DAG: ldr [[ADDR:x[0-9]+]], [ @@ -146,9 +146,9 @@ define void @rv_marker_ptrauth_blrabz_multiarg(i8* (i64, i64, i64)** %arg0, i64 ; CHECK-NEXT: mov x29, x29 ; CHECK-NEXT: bl objc_retainAutoreleasedReturnValue ; - %tmp0 = load i8* (i64, i64, i64)*, i8* (i64, i64, i64)** %arg0 - %call0 = call i8* %tmp0(i64 %c, i64 %b, i64 %a) [ "ptrauth"(i32 1, i64 0), "clang.arc.attachedcall"(i8* (i8*)* @llvm.objc.retainAutoreleasedReturnValue) ] - tail call void @foo2(i8* %call0) - tail call void @llvm.objc.release(i8* %call0) + %tmp0 = load ptr, ptr %arg0 + %call0 = call ptr %tmp0(i64 %c, i64 %b, i64 %a) [ "ptrauth"(i32 1, i64 0), "clang.arc.attachedcall"(ptr @llvm.objc.retainAutoreleasedReturnValue) ] + tail call void @foo2(ptr %call0) + tail call void @llvm.objc.release(ptr %call0) ret void } diff --git a/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll b/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll index 932cc946db0ea..02c643f101913 100644 --- a/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll +++ b/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll @@ -87,7 +87,7 @@ ; CHECK-MACHO-NEXT: _g.offset.ref.da.0: ; CHECK-MACHO-NEXT: .quad (_g+16)@AUTH(da,0) -@g.offset.ref.da.0 = constant ptr ptrauth (i8* getelementptr (i8, ptr @g, i64 16), i32 2) +@g.offset.ref.da.0 = constant ptr ptrauth (ptr getelementptr (i8, ptr @g, i64 16), i32 2) ; CHECK-ELF-LABEL: .globl g.big_offset.ref.da.0 ; CHECK-ELF-NEXT: .p2align 3 @@ -99,7 +99,7 @@ ; CHECK-MACHO-NEXT: _g.big_offset.ref.da.0: ; CHECK-MACHO-NEXT: .quad (_g+2147549185)@AUTH(da,0) -@g.big_offset.ref.da.0 = constant ptr ptrauth (i8* getelementptr (i8, ptr @g, i64 add (i64 2147483648, i64 65537)), i32 2) +@g.big_offset.ref.da.0 = constant ptr ptrauth (ptr getelementptr (i8, ptr @g, i64 add (i64 2147483648, i64 65537)), i32 2) ; CHECK-ELF-LABEL: .globl g.weird_ref.da.0 ; CHECK-ELF-NEXT: .p2align 3 @@ -111,7 +111,7 @@ ; CHECK-MACHO-NEXT: _g.weird_ref.da.0: ; CHECK-MACHO-NEXT: .quad (_g+16)@AUTH(da,0) -@g.weird_ref.da.0 = constant i64 ptrtoint (ptr inttoptr (i64 ptrtoint (ptr ptrauth (i8* getelementptr (i8, ptr @g, i64 16), i32 2) to i64) to ptr) to i64) +@g.weird_ref.da.0 = constant i64 ptrtoint (ptr inttoptr (i64 ptrtoint (ptr ptrauth (ptr getelementptr (i8, ptr @g, i64 16), i32 2) to i64) to ptr) to i64) ; CHECK-ELF-LABEL: .globl g_weak.ref.ia.42 ; CHECK-ELF-NEXT: .p2align 3 diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll index b947c943ba448..72f6646930624 100644 --- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll +++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll @@ -151,12 +151,11 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 
16-byte Folded Spill -; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_f16 @@ -190,12 +189,11 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: bl use_f32 @@ -229,12 +227,11 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_f64 @@ -273,12 +270,11 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 @@ -313,12 +309,11 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 @@ -353,12 +348,11 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte 
Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 @@ -393,12 +387,11 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 @@ -433,12 +426,11 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 @@ -513,12 +505,11 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 @@ -557,12 +548,11 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 @@ -596,12 +586,11 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte 
Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 @@ -635,12 +624,11 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 @@ -674,12 +662,11 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 @@ -713,12 +700,11 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 @@ -752,12 +738,11 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8bf16 @@ -791,12 +776,11 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4f32 @@ -830,12 
+814,11 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 -; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index f2163ad15bafc..df88f37195ed6 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -129,12 +129,11 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: mrs x19, SVCR ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill -; CHECK-NEXT: mrs x19, SVCR -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: tbz w19, #0, .LBB4_2 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: smstop sm diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll index 6c6a691760af3..52a77cb396909 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll @@ -147,15 +147,15 @@ define <2 x float> @extract_v2f32_nxv16f32_2( %arg) { define <4 x i1> @extract_v4i1_nxv32i1_0( %arg) { ; CHECK-LABEL: extract_v4i1_nxv32i1_0: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 -; CHECK-NEXT: umov w8, v1.b[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: umov w9, v1.b[2] +; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 +; CHECK-NEXT: umov w8, v0.b[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: umov w8, v1.b[2] +; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: umov w8, v1.b[3] -; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %ext = call <4 x i1> @llvm.vector.extract.v4i1.nxv32i1( %arg, i64 0) ret <4 x i1> %ext diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll index e10313773c73e..72994100b2970 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll @@ -248,15 +248,15 @@ define <2 x i1> @extract_v2i1_nxv2i1( %inmask) { define <4 x i1> @extract_v4i1_nxv4i1( %inmask) { ; CHECK-LABEL: extract_v4i1_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov 
v0.h[1], w8 +; CHECK-NEXT: mov w8, v1.s[2] +; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: mov w8, v1.s[3] -; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %mask = call <4 x i1> @llvm.vector.extract.v4i1.nxv4i1( %inmask, i64 0) ret <4 x i1> %mask @@ -265,23 +265,23 @@ define <4 x i1> @extract_v4i1_nxv4i1( %inmask) { define <8 x i1> @extract_v8i1_nxv8i1( %inmask) { ; CHECK-LABEL: extract_v8i1_nxv8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.h, p0/z, #1 // =0x1 -; CHECK-NEXT: umov w8, v1.h[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: umov w9, v1.h[2] +; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.b[1], w8 +; CHECK-NEXT: umov w8, v1.h[2] +; CHECK-NEXT: mov v0.b[2], w8 ; CHECK-NEXT: umov w8, v1.h[3] -; CHECK-NEXT: mov v0.b[2], w9 -; CHECK-NEXT: umov w9, v1.h[4] ; CHECK-NEXT: mov v0.b[3], w8 +; CHECK-NEXT: umov w8, v1.h[4] +; CHECK-NEXT: mov v0.b[4], w8 ; CHECK-NEXT: umov w8, v1.h[5] -; CHECK-NEXT: mov v0.b[4], w9 -; CHECK-NEXT: umov w9, v1.h[6] ; CHECK-NEXT: mov v0.b[5], w8 +; CHECK-NEXT: umov w8, v1.h[6] +; CHECK-NEXT: mov v0.b[6], w8 ; CHECK-NEXT: umov w8, v1.h[7] -; CHECK-NEXT: mov v0.b[6], w9 ; CHECK-NEXT: mov v0.b[7], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %mask = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( %inmask, i64 0) ret <8 x i1> %mask @@ -292,9 +292,9 @@ define <8 x i1> @extract_v8i1_nxv8i1( %inmask) { define <16 x i1> @extract_v16i1_nxv16i1( %inmask) { ; CHECK-LABEL: extract_v16i1_nxv16i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.b[1], v1.b[1] +; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov v0.b[1], v0.b[1] ; CHECK-NEXT: mov v0.b[2], v1.b[2] ; CHECK-NEXT: mov v0.b[3], v1.b[3] ; CHECK-NEXT: mov v0.b[4], v1.b[4] @@ -309,6 +309,7 @@ define <16 x i1> @extract_v16i1_nxv16i1( %inmask) { ; CHECK-NEXT: mov v0.b[13], v1.b[13] ; CHECK-NEXT: mov v0.b[14], v1.b[14] ; CHECK-NEXT: mov v0.b[15], v1.b[15] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %mask = call <16 x i1> @llvm.vector.extract.v16i1.nxv16i1( %inmask, i64 0) ret <16 x i1> %mask diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll index 41e4a38fad90b..8e807cda7166d 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll @@ -8,15 +8,15 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i1> @reshuffle_v4i1_nxv4i1( %a) #0 { ; CHECK-LABEL: reshuffle_v4i1_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 -; CHECK-NEXT: mov w8, v1.s[1] -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov w9, v1.s[2] +; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v0.h[1], w8 +; CHECK-NEXT: mov w8, v1.s[2] +; CHECK-NEXT: mov v0.h[2], w8 ; CHECK-NEXT: mov w8, v1.s[3] -; CHECK-NEXT: mov v0.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %el0 = extractelement %a, i32 0 %el1 = extractelement %a, i32 1 diff --git 
a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll index ba4a3a2042305..bd8f432579a08 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -28,53 +28,53 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK: // %bb.0: ; CHECK-NEXT: tbnz w1, #0, .LBB1_2 ; CHECK-NEXT: // %bb.1: // %vector.body +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: umov w8, v0.b[8] -; CHECK-NEXT: mov v1.b[1], v0.b[1] -; CHECK-NEXT: movprfx z3, z0 -; CHECK-NEXT: ext z3.b, z3.b, z0.b, #16 +; CHECK-NEXT: umov w8, v2.b[8] +; CHECK-NEXT: mov v0.b[1], v2.b[1] +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #16 ; CHECK-NEXT: ext v4.16b, v3.16b, v3.16b, #8 -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: mov v1.b[2], v0.b[2] -; CHECK-NEXT: mov v2.b[1], v0.b[9] -; CHECK-NEXT: mov v1.b[3], v0.b[3] -; CHECK-NEXT: mov v2.b[2], v0.b[10] -; CHECK-NEXT: mov v1.b[4], v0.b[4] -; CHECK-NEXT: mov v2.b[3], v0.b[11] -; CHECK-NEXT: mov v1.b[5], v0.b[5] -; CHECK-NEXT: mov v2.b[4], v0.b[12] -; CHECK-NEXT: mov v1.b[6], v0.b[6] -; CHECK-NEXT: mov v2.b[5], v0.b[13] -; CHECK-NEXT: mov v1.b[7], v0.b[7] -; CHECK-NEXT: mov v2.b[6], v0.b[14] -; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: mov v2.b[7], v0.b[15] -; CHECK-NEXT: uunpklo z0.h, z3.b +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: mov v0.b[2], v2.b[2] +; CHECK-NEXT: mov v1.b[1], v2.b[9] +; CHECK-NEXT: mov v0.b[3], v2.b[3] +; CHECK-NEXT: mov v1.b[2], v2.b[10] +; CHECK-NEXT: mov v0.b[4], v2.b[4] +; CHECK-NEXT: mov v1.b[3], v2.b[11] +; CHECK-NEXT: mov v0.b[5], v2.b[5] +; CHECK-NEXT: mov v1.b[4], v2.b[12] +; CHECK-NEXT: mov v0.b[6], v2.b[6] +; CHECK-NEXT: mov v1.b[5], v2.b[13] +; CHECK-NEXT: mov v0.b[7], v2.b[7] +; CHECK-NEXT: mov v1.b[6], v2.b[14] +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: mov v1.b[7], v2.b[15] +; CHECK-NEXT: uunpklo z2.h, z3.b ; CHECK-NEXT: uunpklo z3.h, z4.b -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z2.h, z2.b ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: lsl z1.s, z1.s, #31 +; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: lsl z0.s, z0.s, #31 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: lsl z2.s, z2.s, #31 ; CHECK-NEXT: lsl z3.s, z3.s, #31 -; CHECK-NEXT: asr z1.s, z1.s, #31 ; CHECK-NEXT: asr z0.s, z0.s, #31 +; CHECK-NEXT: asr z2.s, z2.s, #31 ; CHECK-NEXT: asr z3.s, z3.s, #31 -; CHECK-NEXT: lsl z2.s, z2.s, #31 -; CHECK-NEXT: cmpne p3.s, p0/z, z1.s, #0 -; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 -; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: lsl z1.s, z1.s, #31 +; CHECK-NEXT: cmpne p3.s, p0/z, z0.s, #0 +; CHECK-NEXT: cmpne p1.s, p0/z, z2.s, #0 +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, #0 -; CHECK-NEXT: asr z2.s, z2.s, #31 -; CHECK-NEXT: cmpne p0.s, p0/z, z2.s, #0 -; CHECK-NEXT: st1w { z0.s }, p1, [x0, #2, mul vl] -; CHECK-NEXT: st1w { z0.s }, p2, [x0, #3, mul vl] -; CHECK-NEXT: st1w { z0.s }, p3, [x0] -; CHECK-NEXT: st1w { z0.s }, p0, [x0, #1, mul vl] +; CHECK-NEXT: asr z1.s, z1.s, #31 +; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 +; CHECK-NEXT: st1w { z2.s }, p1, [x0, #2, mul vl] +; CHECK-NEXT: st1w { z2.s }, p2, [x0, #3, mul vl] +; CHECK-NEXT: st1w { z2.s }, p3, [x0] +; 
CHECK-NEXT: st1w { z2.s }, p0, [x0, #1, mul vl] ; CHECK-NEXT: .LBB1_2: // %exit ; CHECK-NEXT: ret %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll index 124f81e7864d1..39fe92aae0619 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll @@ -11,12 +11,12 @@ define void @test_sink_ptrue_into_ptest(i32 %n) { ; CHECK-NEXT: whilelt p0.s, wzr, w0 ; CHECK-NEXT: b.pl .LBB0_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: cntw x9 +; CHECK-NEXT: mov w9, wzr +; CHECK-NEXT: cntw x8 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: whilelt p0.s, w8, w0 -; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: whilelt p0.s, w9, w0 +; CHECK-NEXT: add w9, w9, w8 ; CHECK-NEXT: b.mi .LBB0_2 ; CHECK-NEXT: .LBB0_3: // %exit ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index 74a717f1635a3..935189dec48ac 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -2835,11 +2835,11 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB24_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v0.16b }, [x1], #16 -; CHECK-BE-NEXT: add x8, x0, #16 +; CHECK-BE-NEXT: mov x8, x0 ; CHECK-BE-NEXT: ld1 { v1.8h }, [x0] -; CHECK-BE-NEXT: ld1 { v3.8h }, [x8] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: add x10, x0, #32 +; CHECK-BE-NEXT: add x0, x0, #16 +; CHECK-BE-NEXT: add x9, x8, #48 +; CHECK-BE-NEXT: ld1 { v3.8h }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0 @@ -2847,11 +2847,11 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: umull2 v5.4s, v3.8h, v0.8h ; CHECK-BE-NEXT: umull v0.4s, v3.4h, v0.4h ; CHECK-BE-NEXT: umull2 v1.4s, v1.8h, v2.8h -; CHECK-BE-NEXT: st1 { v4.4s }, [x0] -; CHECK-BE-NEXT: mov x0, x8 +; CHECK-BE-NEXT: st1 { v4.4s }, [x8] +; CHECK-BE-NEXT: add x8, x8, #32 ; CHECK-BE-NEXT: st1 { v5.4s }, [x9] -; CHECK-BE-NEXT: st1 { v0.4s }, [x10] -; CHECK-BE-NEXT: st1 { v1.4s }, [x8] +; CHECK-BE-NEXT: st1 { v0.4s }, [x8] +; CHECK-BE-NEXT: st1 { v1.4s }, [x0] ; CHECK-BE-NEXT: b.ne .LBB24_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -2950,26 +2950,26 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB25_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v4.16b }, [x0] -; CHECK-BE-NEXT: add x9, x1, #48 -; CHECK-BE-NEXT: add x8, x1, #32 -; CHECK-BE-NEXT: ld1 { v18.4s }, [x9] +; CHECK-BE-NEXT: add x10, x1, #48 ; CHECK-BE-NEXT: ld1 { v16.4s }, [x1] +; CHECK-BE-NEXT: add x9, x1, #32 +; CHECK-BE-NEXT: ld1 { v18.4s }, [x10] ; CHECK-BE-NEXT: add x1, x1, #16 -; CHECK-BE-NEXT: ld1 { v20.4s }, [x8] +; CHECK-BE-NEXT: ld1 { v20.4s }, [x9] ; CHECK-BE-NEXT: ld1 { v22.4s }, [x1] -; CHECK-BE-NEXT: add x8, x0, #96 +; CHECK-BE-NEXT: add x9, x0, #96 ; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v3.16b ; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v2.16b ; CHECK-BE-NEXT: tbl v7.16b, { v4.16b }, v1.16b ; CHECK-BE-NEXT: tbl v4.16b, { v4.16b }, v0.16b ; CHECK-BE-NEXT: ext v24.16b, v18.16b, v18.16b, #8 -; CHECK-BE-NEXT: add x9, 
x0, #32 +; CHECK-BE-NEXT: mov x8, x0 ; CHECK-BE-NEXT: ext v25.16b, v20.16b, v20.16b, #8 -; CHECK-BE-NEXT: add x10, x0, #16 +; CHECK-BE-NEXT: add x10, x0, #32 ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ext v17.16b, v5.16b, v5.16b, #8 -; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8 ; CHECK-BE-NEXT: rev32 v5.8b, v5.8b +; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8 ; CHECK-BE-NEXT: rev32 v21.8b, v7.8b ; CHECK-BE-NEXT: rev32 v23.8b, v4.8b ; CHECK-BE-NEXT: ext v7.16b, v7.16b, v7.16b, #8 @@ -2986,22 +2986,22 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: rev32 v4.8b, v4.8b ; CHECK-BE-NEXT: umull v17.2d, v17.2s, v24.2s ; CHECK-BE-NEXT: umull v19.2d, v19.2s, v25.2s -; CHECK-BE-NEXT: st1 { v5.2d }, [x8] +; CHECK-BE-NEXT: st1 { v5.2d }, [x9] ; CHECK-BE-NEXT: umull v5.2d, v6.2s, v20.2s ; CHECK-BE-NEXT: umull v6.2d, v7.2s, v21.2s -; CHECK-BE-NEXT: add x8, x0, #112 +; CHECK-BE-NEXT: add x9, x0, #112 ; CHECK-BE-NEXT: umull v4.2d, v4.2s, v16.2s -; CHECK-BE-NEXT: st1 { v18.2d }, [x9] -; CHECK-BE-NEXT: add x9, x0, #80 +; CHECK-BE-NEXT: st1 { v18.2d }, [x10] +; CHECK-BE-NEXT: add x10, x0, #80 ; CHECK-BE-NEXT: st1 { v22.2d }, [x0] -; CHECK-BE-NEXT: st1 { v17.2d }, [x8] -; CHECK-BE-NEXT: add x8, x0, #64 -; CHECK-BE-NEXT: st1 { v19.2d }, [x9] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: mov x0, x8 -; CHECK-BE-NEXT: st1 { v5.2d }, [x8] +; CHECK-BE-NEXT: add x0, x0, #64 +; CHECK-BE-NEXT: st1 { v17.2d }, [x9] +; CHECK-BE-NEXT: add x9, x8, #48 +; CHECK-BE-NEXT: add x8, x8, #16 +; CHECK-BE-NEXT: st1 { v19.2d }, [x10] +; CHECK-BE-NEXT: st1 { v5.2d }, [x0] ; CHECK-BE-NEXT: st1 { v6.2d }, [x9] -; CHECK-BE-NEXT: st1 { v4.2d }, [x10] +; CHECK-BE-NEXT: st1 { v4.2d }, [x8] ; CHECK-BE-NEXT: b.ne .LBB25_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -3093,13 +3093,14 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB26_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v4.16b }, [x1], #16 -; CHECK-BE-NEXT: add x8, x0, #32 +; CHECK-BE-NEXT: mov x8, x0 +; CHECK-BE-NEXT: add x9, x0, #32 ; CHECK-BE-NEXT: ld1 { v16.4s }, [x0] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: add x10, x0, #16 -; CHECK-BE-NEXT: ld1 { v17.4s }, [x8] -; CHECK-BE-NEXT: ld1 { v18.4s }, [x9] -; CHECK-BE-NEXT: ld1 { v19.4s }, [x10] +; CHECK-BE-NEXT: add x10, x0, #48 +; CHECK-BE-NEXT: add x0, x0, #16 +; CHECK-BE-NEXT: ld1 { v17.4s }, [x9] +; CHECK-BE-NEXT: ld1 { v18.4s }, [x10] +; CHECK-BE-NEXT: ld1 { v19.4s }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: tbl v5.16b, { v4.16b }, v1.16b ; CHECK-BE-NEXT: tbl v6.16b, { v4.16b }, v3.16b @@ -3113,11 +3114,10 @@ define i32 @test_widening_instr_mull_2(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: mul v6.4s, v17.4s, v6.4s ; CHECK-BE-NEXT: mul v7.4s, v18.4s, v7.4s ; CHECK-BE-NEXT: mul v4.4s, v19.4s, v4.4s -; CHECK-BE-NEXT: st1 { v5.4s }, [x0] -; CHECK-BE-NEXT: mov x0, x10 -; CHECK-BE-NEXT: st1 { v6.4s }, [x8] -; CHECK-BE-NEXT: st1 { v7.4s }, [x9] -; CHECK-BE-NEXT: st1 { v4.4s }, [x10] +; CHECK-BE-NEXT: st1 { v5.4s }, [x8] +; CHECK-BE-NEXT: st1 { v6.4s }, [x9] +; CHECK-BE-NEXT: st1 { v7.4s }, [x10] +; CHECK-BE-NEXT: st1 { v4.4s }, [x0] ; CHECK-BE-NEXT: b.ne .LBB26_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr @@ -3246,11 +3246,11 @@ define i32 @mul_zext_16i8_sext_16i16(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: .LBB28_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v0.16b }, [x1], 
#16 -; CHECK-BE-NEXT: add x8, x0, #16 +; CHECK-BE-NEXT: mov x8, x0 ; CHECK-BE-NEXT: ld1 { v1.8h }, [x0] -; CHECK-BE-NEXT: ld1 { v3.8h }, [x8] -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: add x10, x0, #32 +; CHECK-BE-NEXT: add x0, x0, #16 +; CHECK-BE-NEXT: add x9, x8, #48 +; CHECK-BE-NEXT: ld1 { v3.8h }, [x0] ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0 @@ -3258,11 +3258,11 @@ define i32 @mul_zext_16i8_sext_16i16(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: smull2 v5.4s, v3.8h, v0.8h ; CHECK-BE-NEXT: smull v0.4s, v3.4h, v0.4h ; CHECK-BE-NEXT: smull2 v1.4s, v1.8h, v2.8h -; CHECK-BE-NEXT: st1 { v4.4s }, [x0] -; CHECK-BE-NEXT: mov x0, x8 +; CHECK-BE-NEXT: st1 { v4.4s }, [x8] +; CHECK-BE-NEXT: add x8, x8, #32 ; CHECK-BE-NEXT: st1 { v5.4s }, [x9] -; CHECK-BE-NEXT: st1 { v0.4s }, [x10] -; CHECK-BE-NEXT: st1 { v1.4s }, [x8] +; CHECK-BE-NEXT: st1 { v0.4s }, [x8] +; CHECK-BE-NEXT: st1 { v1.4s }, [x0] ; CHECK-BE-NEXT: b.ne .LBB28_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: mov w0, wzr diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll index 30a78648c186a..39618b05e6c71 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll @@ -368,7 +368,10 @@ define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) { define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) { ; GCN-LABEL: test_clamp_bf16_folding: ; GCN: ; %bb.0: -; GCN-NEXT: v_exp_bf16_e64 v0, v0 clamp +; GCN-NEXT: v_exp_bf16_e32 v0, v0 +; GCN-NEXT: v_nop +; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp ; GCN-NEXT: ; return to shader part epilog %exp = call bfloat @llvm.exp2.bf16(bfloat %src) %max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0) diff --git a/llvm/test/CodeGen/AMDGPU/cc-entry.ll b/llvm/test/CodeGen/AMDGPU/cc-entry.ll new file mode 100644 index 0000000000000..5aa146ce87708 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cc-entry.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s + +define amdgpu_kernel void @entry_fn() { +; CHECK-LABEL: entry_fn: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_sext_i32_i16 s5, s5 +; CHECK-NEXT: s_add_co_u32 s4, s4, entry_fn@gotpcrel32@lo+8 +; CHECK-NEXT: s_add_co_ci_u32 s5, s5, entry_fn@gotpcrel32@hi+16 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: s_endpgm +entry: + call void @entry_fn() + ret void +} + +define void @caller() { +; CHECK-LABEL: caller: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT: s_wait_expcnt 0x0 +; CHECK-NEXT: s_wait_samplecnt 0x0 +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_mov_b32 s0, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b32 s1, -1 +; CHECK-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_mov_b32 exec_lo, s1 +; CHECK-NEXT: v_writelane_b32 v40, s0, 2 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: s_add_co_i32 s32, s32, 16 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: s_mov_b64 s[0:1], s[4:5] +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: 
s_sext_i32_i16 s5, s5 +; CHECK-NEXT: s_add_co_u32 s4, s4, entry_fn@gotpcrel32@lo+12 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_add_co_ci_u32 s5, s5, entry_fn@gotpcrel32@hi+24 +; CHECK-NEXT: v_mov_b32_e32 v0, v31 +; CHECK-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_mov_b64 s[2:3], s[6:7] +; CHECK-NEXT: s_mov_b64 s[6:7], s[10:11] +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: v_readlane_b32 s0, v40, 2 +; CHECK-NEXT: s_or_saveexec_b32 s1, -1 +; CHECK-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_mov_b32 exec_lo, s1 +; CHECK-NEXT: s_mov_b32 s33, s0 +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + call void @entry_fn() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/ds-read2-write2-debug-info.ll b/llvm/test/CodeGen/AMDGPU/ds-read2-write2-debug-info.ll new file mode 100644 index 0000000000000..85f2ec1238179 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ds-read2-write2-debug-info.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=debugify < %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck %s + +@lds = addrspace(3) global [512 x float] poison, align 4 + +define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 { + %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %in.gep = getelementptr float, ptr addrspace(1) %in, i32 %x.i + %val = load float, ptr addrspace(1) %in.gep, align 4 + %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i + store float %val, ptr addrspace(3) %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x + store float %val, ptr addrspace(3) %arrayidx1, align 4 + ret void +} + +define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 { + %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i + %val0 = load float, ptr addrspace(3) %arrayidx0, align 4 + %add.x = add nsw i32 %x.i, 8 + %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x + %val1 = load float, ptr addrspace(3) %arrayidx1, align 4 + %sum = fadd float %val0, %val1 + %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i + store float %sum, ptr addrspace(1) %out.gep, align 4 + ret void +} + +attributes #0 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll index c552f9d283597..88a51e9ccf04c 100644 --- a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll +++ b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll @@ -1,10 +1,13 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --stress-regalloc=10 < %s | FileCheck -check-prefix=GCN %s +; Rematerialization test for fp64 constants (w/ intentionally high register pressure). 
+; Check to make sure we have at least six constant MOVs, not necessarily consecutive, inside the loop. + ; GCN-LABEL: {{^}}test_remat_sgpr: ; GCN-NOT: v_writelane_b32 -; GCN-COUNT-4: s_mov_b32 s{{[0-9]+}}, 0x ; GCN: {{^}}[[LOOP:.LBB[0-9_]+]]: +; GCN-COUNT-6: {{s_mov_b32|v_mov_b32_e32}} {{[sv]}}{{[0-9]+}}, 0x ; GCN-NOT: v_writelane_b32 ; GCN: s_cbranch_{{[^ ]+}} [[LOOP]] ; GCN: .sgpr_spill_count: 0 diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll index 2d54ac8283a3a..9686c9d30b97c 100644 --- a/llvm/test/CodeGen/AMDGPU/spillv16.ll +++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-TRUE16 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-FAKE16 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16,+d16-write-vgpr32 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX12-TRUE16,GFX12-TRUE16-D16W32 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16,-d16-write-vgpr32 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX12-TRUE16,GFX12-TRUE16-D16W16 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-TRUE16 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-FAKE16 @@ -35,6 +37,26 @@ define void @spill_i16_alu() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_i16_alu: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-TRUE16-LABEL: spill_i16_alu: ; GFX1250-TRUE16: ; %bb.0: ; %entry ; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -126,6 +148,56 @@ define void @spill_i16_alu_two_vals() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-D16W32-LABEL: spill_i16_alu_two_vals: +; GFX12-TRUE16-D16W32: ; %bb.0: ; %entry +; GFX12-TRUE16-D16W32-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: s_wait_bvhcnt 0x0 +; 
GFX12-TRUE16-D16W32-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-D16W32-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill +; GFX12-TRUE16-D16W32-NEXT: ;;#ASMSTART +; GFX12-TRUE16-D16W32-NEXT: ;;#ASMEND +; GFX12-TRUE16-D16W32-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-D16W32-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-D16W32-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W32-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-D16W16-LABEL: spill_i16_alu_two_vals: +; GFX12-TRUE16-D16W16: ; %bb.0: ; %entry +; GFX12-TRUE16-D16W16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-D16W16-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill +; GFX12-TRUE16-D16W16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-D16W16-NEXT: ;;#ASMEND +; GFX12-TRUE16-D16W16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-D16W16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l +; GFX12-TRUE16-D16W16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-D16W16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-D16W16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-TRUE16-LABEL: spill_i16_alu_two_vals: ; GFX1250-TRUE16: ; %bb.0: ; %entry ; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -223,6 +295,25 @@ define void @spill_i16() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_i16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; 
GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-LABEL: spill_i16: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -282,6 +373,25 @@ define void @spill_half() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_half: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-LABEL: spill_half: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -341,6 +451,25 @@ define void @spill_i16_from_v2i16() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_i16_from_v2i16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-LABEL: spill_i16_from_v2i16: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -414,13 +543,39 @@ define void @spill_2xi16_from_v2i16() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_2xi16_from_v2i16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 th:TH_LOAD_LU ; 2-byte Folded Reload +; 
GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16: ; GFX1250-TRUE16: ; %bb.0: ; %entry ; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: s_clause 0x1 +; GFX1250-TRUE16-NEXT: s_clause 0x1 ; 4-byte Folded Spill ; GFX1250-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:12 ; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -444,7 +599,7 @@ define void @spill_2xi16_from_v2i16() { ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-FAKE16-NEXT: s_clause 0x1 +; GFX1250-FAKE16-NEXT: s_clause 0x1 ; 4-byte Folded Spill ; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -520,6 +675,32 @@ define void @spill_2xi16_from_v2i16_one_free_reg() { ; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 th:TH_LOAD_LU ; 2-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg: ; GFX1250-TRUE16: ; %bb.0: ; %entry ; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -595,6 +776,25 @@ define void @spill_v2i16() { ; GCN-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; +; GFX12-TRUE16-LABEL: spill_v2i16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: 
s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill +; GFX12-TRUE16-NEXT: ;;#ASMSTART +; GFX12-TRUE16-NEXT: ;;#ASMEND +; GFX12-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-LABEL: spill_v2i16: ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll new file mode 100644 index 0000000000000..22fba8c1d5f8c --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-struct.ll @@ -0,0 +1,59 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s +; +; Tests for indexed types in dynamically indexed arrays in cbuffers. +; +; struct S { +; float x[2]; +; uint q; +; }; +; cbuffer CB : register(b0) { +; uint32_t3 w[3]; // offset 0, size 12 (+4) * 3 +; S v[3]; // offset 48, size 24 (+8) * 3 +; } +%S = type <{ <{ [1 x <{ float, target("dx.Padding", 12) }>], float }>, i32 }> +%__cblayout_CB = type <{ + <{ + [2 x <{ <3 x i32>, target("dx.Padding", 4) }>], + <3 x i32> + }>, + target("dx.Padding", 4), + <{ + [2 x <{ %S, target("dx.Padding", 8) }>], %S + }> +}> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst, i32 %idx) { +entry: + %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; w[2].z + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: store i32 [[X]], ptr %dst + %w_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %w_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %w_ptr, i32 40 + %w_load = load i32, ptr addrspace(2) %w_gep, align 4 + store i32 %w_load, ptr %dst, align 4 + + ;; v[2].q + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 8) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %v_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %v_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %v_ptr, i32 84 + %v_load = load i32, ptr addrspace(2) %v_gep, align 4 + %v.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %v_load, ptr %v.i, align 4 + + ret void +} diff --git 
a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll
new file mode 100644
index 0000000000000..615fc5ea07eca
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-of-vector.ll
@@ -0,0 +1,49 @@
+; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s
+;
+; Test for when we have indices into both the array and the vector, i.e. s[1][3].
+
+; cbuffer CB : register(b0) {
+;   uint4 s[3]; // offset 0, size 16 * 3
+; }
%__cblayout_CB = type <{ [2 x <4 x i32>] }>
+
+@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison
+
+; CHECK: define void @f
+define void @f(ptr %dst) {
+entry:
+  %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null)
+  store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4
+
+  ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb
+  %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4
+
+  ;; s[1][3]
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1)
+  ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3
+  ; CHECK: store i32 [[X]], ptr %dst
+  %i8_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0)
+  %i8_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %i8_ptr, i32 28
+  %i8_vecext = load i32, ptr addrspace(2) %i8_gep, align 4
+  store i32 %i8_vecext, ptr %dst, align 4
+
+  ;; s[2].w
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2)
+  ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3
+  ;;
+  ;; It would be nice to avoid the redundant vector creation here, but that's
+  ;; outside the scope of this pass.
+  ;;
+  ; CHECK: [[X_VEC:%.*]] = insertelement <4 x i32> {{%.*}}, i32 [[X]], i32 3
+  ; CHECK: [[X_EXT:%.*]] = extractelement <4 x i32> [[X_VEC]], i32 3
+  ; CHECK: store i32 [[X_EXT]], ptr %dst
+  %typed_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0)
+  %typed_gep = getelementptr <4 x i32>, ptr addrspace(2) %typed_ptr, i32 2
+  %typed_load = load <4 x i32>, ptr addrspace(2) %typed_gep, align 16
+  %typed_vecext = extractelement <4 x i32> %typed_load, i32 3
+  store i32 %typed_vecext, ptr %dst, align 4
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll
new file mode 100644
index 0000000000000..eabc07c2fbb68
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-array-typedgep.ll
@@ -0,0 +1,30 @@
+; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s
+
+; cbuffer CB : register(b0) {
+;   float a1[3];
+; }
%__cblayout_CB = type <{ [2 x <{ float, [12 x i8] }>], float }>
+
+@CB.cb = global target("dx.CBuffer", %__cblayout_CB) poison
+
+; CHECK: define void @f
+define void @f(ptr %dst) {
+entry:
+  %CB.cb_h = call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null)
+  store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4
+
+  ;; a1[1]
+  ;; Note that the valid GEPs of a1 are `0, 0, 0`, `0, 0, 1`, and `0, 1`.
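+  ;; As a worked example of the row arithmetic (each array element here is
+  ;; padded out to a full 16-byte row): a1[1] sits at byte offset 16, which is
+  ;; row 16/16 = 1, component (16 % 16)/4 = 0, matching the CHECK lines below.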
+ ; + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[X]], ptr %dst + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 8 + %a1_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1_gep = getelementptr inbounds <{ [2 x <{ float, [12 x i8] }>], float }>, ptr addrspace(2) %a1_ptr, i32 0, i32 0, i32 1 + %a1 = load float, ptr addrspace(2) %a1_gep, align 4 + store float %a1, ptr %dst, align 32 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll new file mode 100644 index 0000000000000..6f6166e820a6f --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-arrays.ll @@ -0,0 +1,145 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s + +; cbuffer CB : register(b0) { +; float a1[3]; // offset 0, size 4 (+12) * 3 +; double3 a2[2]; // offset 48, size 24 (+8) * 2 +; float16_t a3[2][2]; // offset 112, size 2 (+14) * 4 +; uint64_t a4[3]; // offset 176, size 8 (+8) * 3 +; int4 a5[2][3][4]; // offset 224, size 16 * 24 +; uint16_t a6[1]; // offset 608, size 2 (+14) * 1 +; int64_t a7[2]; // offset 624, size 8 (+8) * 2 +; bool a8[4]; // offset 656, size 4 (+12) * 4 +; } +%__cblayout_CB = type <{ + <{ [2 x <{ float, target("dx.Padding", 12) }>], float }>, target("dx.Padding", 12), + <{ [1 x <{ <3 x double>, target("dx.Padding", 8) }>], <3 x double> }>, target("dx.Padding", 8), + <{ [3 x <{ half, target("dx.Padding", 14) }>], half }>, target("dx.Padding", 14), + <{ [2 x <{ i64, target("dx.Padding", 8) }>], i64 }>, target("dx.Padding", 8), + [24 x <4 x i32>], + [1 x i16], target("dx.Padding", 14), + <{ [1 x <{ i64, target("dx.Padding", 8) }>], i64 }>, target("dx.Padding", 8), + <{ [3 x <{ i32, target("dx.Padding", 12) }>], i32 }> +}> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h.i.i = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h.i.i, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; a1[1] + ; + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[X]], ptr %dst + %a1_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a1_ptr, i32 16 + %a1 = load float, ptr addrspace(2) %a1_gep, align 4 + store float %a1, ptr %dst, align 32 + + ;; a2[1] + ; + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 5) + ; CHECK: [[X:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1 + ; CHECK: 
[[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 6) + ; CHECK: [[Z:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x double> [[VEC0]], double [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x double> [[VEC1]], double [[Z]], i32 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 8 + ; CHECK: store <3 x double> [[VEC2]], ptr [[PTR]] + %a2_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %a2_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a2_ptr, i32 32 + %a2 = load <3 x double>, ptr addrspace(2) %a2_gep, align 8 + %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 8 + store <3 x double> %a2, ptr %a2.i, align 32 + + ;; a3[0][1] + ; + ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 8) + ; CHECK: [[X:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 32 + ; CHECK: store half [[X]], ptr [[PTR]] + %a3_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 112) + %a3_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a3_ptr, i32 16 + %a3 = load half, ptr addrspace(2) %a3_gep, align 2 + %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 32 + store half %a3, ptr %a3.i, align 2 + + ;; a4[1] + ; + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 12) + ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 40 + ; CHECK: store i64 [[X]], ptr [[PTR]] + %a4_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 176) + %a4_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a4_ptr, i32 16 + %a4 = load i64, ptr addrspace(2) %a4_gep, align 8 + %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 40 + store i64 %a4, ptr %a4.i, align 8 + + ;; a5[1][0][0] + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 26) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: [[A:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3 + ; CHECK: [[VEC0:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <4 x i32> [[VEC0]], i32 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <4 x i32> [[VEC1]], i32 [[Z]], i32 2 + ; CHECK: [[VEC3:%.*]] = insertelement <4 x i32> [[VEC2]], i32 [[A]], i32 3 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 48 + ; CHECK: store <4 x i32> [[VEC3]], ptr [[PTR]] + %a5_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 224) + %a5_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a5_ptr, i32 192 + %a5 = load <4 x i32>, ptr addrspace(2) %a5_gep, align 4 + %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 48 + store <4 
x i32> %a5, ptr %a5.i, align 4 + + ;; a6[0] + ; + ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 38) + ; CHECK: [[X:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 64 + ; CHECK: store i16 [[X]], ptr [[PTR]] + %a6_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 608) + %a6 = load i16, ptr addrspace(2) %a6_ptr, align 2 + %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 64 + store i16 %a6, ptr %a6.i, align 2 + + ;; a7[1] + ; + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 40) + ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 72 + ; CHECK: store i64 [[X]], ptr [[PTR]] + %a7_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 624) + %a7_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a7_ptr, i32 16 + %a7 = load i64, ptr addrspace(2) %a7_gep, align 8 + %a7.i = getelementptr inbounds nuw i8, ptr %dst, i32 72 + store i64 %a7, ptr %a7.i, align 8 + + ;; a8[1] + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 42) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 80 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %a8_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 656) + %a8_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %a8_ptr, i32 16 + %a8 = load i32, ptr addrspace(2) %a8_gep, align 4, !range !0, !noundef !1 + %a8.i = getelementptr inbounds nuw i8, ptr %dst, i32 80 + store i32 %a8, ptr %a8.i, align 4 + + ret void +} + +!0 = !{i32 0, i32 2} +!1 = !{} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll new file mode 100644 index 0000000000000..22994cfc3f48a --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic-struct.ll @@ -0,0 +1,64 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s +; +; Tests for indexed types in dynamically indexed arrays in cbuffers. 
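+; Unlike the constant-index version of this test in
+; load-cbuffer-array-of-struct.ll, the cbufferrow index here has to be
+; computed at runtime from %idx.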
+; +; Bug https://github.com/llvm/llvm-project/issues/164517 +; XFAIL: * +; +; struct S { +; float x[2]; +; uint q; +; }; +; cbuffer CB : register(b0) { +; uint32_t3 w[3]; // offset 0, size 12 (+4) * 3 +; S v[3]; // offset 48, size 24 (+8) * 3 +; } +%S = type <{ <{ [1 x <{ float, target("dx.Padding", 12) }>], float }>, i32 }> +%__cblayout_CB = type <{ + <{ + [2 x <{ <3 x i32>, target("dx.Padding", 4) }>], + <3 x i32> + }>, + target("dx.Padding", 4), + <{ + [2 x <{ %S, target("dx.Padding", 8) }>], %S + }> +}> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst, i32 %idx) { +entry: + %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; w[idx].z + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 %idx) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: store i32 [[X]], ptr %dst + %w_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %w_arrayidx = getelementptr <3 x i32>, ptr addrspace(2) %w_ptr, i32 %idx + %w_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %w_arrayidx, i32 4 + %w_load = load i32, ptr addrspace(2) %w_gep, align 4 + store i32 %w_load, ptr %dst, align 4 + + ;; v[idx].q + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 %idx) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %v_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %v_arrayidx = getelementptr <{ %struct.S, target("dx.Padding", 4) }>, ptr addrspace(2) %v_ptr, i32 %idx + %v_gep = getelementptr inbounds nuw i8, ptr addrspace(2) %v_arrayidx, i32 8 + %v_load = load i32, ptr addrspace(2) %v_gep, align 4 + %v.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %v_load, ptr %v.i, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll new file mode 100644 index 0000000000000..7daebaed70442 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-dynamic.ll @@ -0,0 +1,46 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s +; +; Tests for dynamic indices into arrays in cbuffers. 
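+;
+; Each uint element is padded out to a full 16-byte row, so s[idx] lives in
+; row idx, and t, which starts at byte 160 (row 160/16 = 10), gives
+; t[idx] = row 10 + idx; this is the `add i32 10, %idx` checked below.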
+ +; cbuffer CB : register(b0) { +; uint s[10]; // offset 0, size 4 (+12) * 10 +; uint t[12]; // offset 160, size 4 (+12) * 12 +; } +%__cblayout_CB = type <{ <{ [9 x <{ i32, target("dx.Padding", 12) }>], i32 }>, target("dx.Padding", 12), <{ [11 x <{ i32, target("dx.Padding", 12) }>], i32 }> }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst, i32 %idx) { +entry: + %CB.cb_h = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefromimplicitbinding(i32 1, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 4 + + ;; s[idx] + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 %idx) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: store i32 [[X]], ptr %dst + %s_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %s_gep = getelementptr <{ i32, target("dx.Padding", 12) }>, ptr addrspace(2) %s_ptr, i32 %idx + %s_load = load i32, ptr addrspace(2) %s_gep, align 4 + store i32 %s_load, ptr %dst, align 4 + + ;; t[idx] + ; + ; CHECK: [[T_IDX:%.*]] = add i32 10, %idx + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 [[T_IDX]]) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[X]], ptr [[PTR]] + %t_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 160) + %t_gep = getelementptr <{ i32, target("dx.Padding", 12) }>, ptr addrspace(2) %t_ptr, i32 %idx + %t_load = load i32, ptr addrspace(2) %t_gep, align 4 + %t.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %t_load, ptr %t.i, align 4 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll new file mode 100644 index 0000000000000..65c9a3ec966e9 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-scalars.ll @@ -0,0 +1,101 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s + +; cbuffer CB { +; float a1; // offset 0, size 4 +; int a2; // offset 4, size 4 +; bool a3; // offset 8, size 4 +; float16_t a4; // offset 12, size 2 +; uint16_t a5; // offset 14, size 2 +; double a6; // offset 16, size 8 +; int64_t a7; // offset 24, size 8 +; } +%__cblayout_CB = type <{ float, i32, i32, half, i16, double, i64 }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h.i.i = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h.i.i, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 8 + + ;; a1 + ; + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", 
%__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A1:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: store float [[A1]], ptr %dst + %a1_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1 = load float, ptr addrspace(2) %a1_ptr, align 4 + store float %a1, ptr %dst, align 8 + + ;; a2 + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A2:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 4 + ; CHECK: store i32 [[A2]], ptr [[PTR]] + %a2_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 4) + %a2 = load i32, ptr addrspace(2) %a2_ptr, align 4 + %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 4 + store i32 %a2, ptr %a2.i, align 8 + + ;; a3 + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A3:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 8 + ; CHECK: store i32 [[A3]], ptr [[PTR]] + %a3_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 8) + %a3 = load i32, ptr addrspace(2) %a3_ptr, align 4 + %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 8 + store i32 %a3, ptr %a3.i, align 4 + + ;; a4 + ; + ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A4:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 6 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 12 + ; CHECK: store half [[A4]], ptr [[PTR]] + %a4_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 12) + %a4 = load half, ptr addrspace(2) %a4_ptr, align 2 + %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 12 + store half %a4, ptr %a4.i, align 4 + + ;; a5 + ; + ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[A5:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 7 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 14 + ; CHECK: store i16 [[A5]], ptr [[PTR]] + %a5_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 14) + %a5 = load i16, ptr addrspace(2) %a5_ptr, align 2 + %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 14 + store i16 %a5, ptr %a5.i, align 2 + + ;; a6 + ; + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[A6:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 16 + ; CHECK: store double [[A6]], ptr [[PTR]] + %a6_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 16) + %a6 = load double, ptr addrspace(2) %a6_ptr, align 8 + %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 16 + store double %a6, ptr %a6.i, align 8 + + ;; a7 + ; + ; CHECK: [[LOAD:%.*]] = call { 
i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[A7:%.*]] = extractvalue { i64, i64 } [[LOAD]], 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 24 + ; CHECK: store i64 [[A7]], ptr [[PTR]] + %a7_ptr = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 24) + %a7 = load i64, ptr addrspace(2) %a7_ptr, align 8 + %a7.i = getelementptr inbounds nuw i8, ptr %dst, i32 24 + store i64 %a7, ptr %a7.i, align 8 + + ret void +} diff --git a/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll new file mode 100644 index 0000000000000..0156a1a0472ab --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ResourceAccess/load-cbuffer-vectors.ll @@ -0,0 +1,121 @@ +; RUN: opt -S -dxil-resource-access -mtriple=dxil %s | FileCheck %s + +; cbuffer CB { +; float3 a1; // offset 0, size 12 (+4) +; double3 a2; // offset 16, size 24 +; float16_t2 a3; // offset 40, size 4 (+4) +; uint64_t3 a4; // offset 48, size 24 (+8) +; int4 a5; // offset 80, size 16 +; uint16_t3 a6; // offset 96, size 6 +; }; +%__cblayout_CB = type <{ <3 x float>, target("dx.Padding", 4), <3 x double>, <2 x half>, target("dx.Padding", 4), <3 x i64>, target("dx.Padding", 8), <4 x i32>, <3 x i16> }> + +@CB.cb = local_unnamed_addr global target("dx.CBuffer", %__cblayout_CB) poison + +; CHECK: define void @f +define void @f(ptr %dst) { +entry: + %CB.cb_h.i.i = tail call target("dx.CBuffer", %__cblayout_CB) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr null) + store target("dx.CBuffer", %__cblayout_CB) %CB.cb_h.i.i, ptr @CB.cb, align 4 + + ; CHECK: [[CB:%.*]] = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb + %CB.cb = load target("dx.CBuffer", %__cblayout_CB), ptr @CB.cb, align 8 + + ;; a1 + ; + ; CHECK: [[LOAD:%.*]] = call { float, float, float, float } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 0) + ; CHECK: [[X:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { float, float, float, float } [[LOAD]], 2 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x float> poison, float [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x float> [[VEC0]], float [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x float> [[VEC1]], float [[Z]], i32 2 + ; CHECK: store <3 x float> [[VEC2]], ptr %dst + %a1_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 0) + %a1 = load <3 x float>, ptr addrspace(2) %a1_gep, align 16 + store <3 x float> %a1, ptr %dst, align 4 + + ;; a2 + ; + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 1) + ; CHECK: [[X:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { double, double } [[LOAD]], 1 + ; CHECK: [[LOAD:%.*]] = call { double, double } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2) + ; CHECK: [[Z:%.*]] = extractvalue { double, double } [[LOAD]], 0 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x double> poison, double [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x double> [[VEC0]], double [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x double> [[VEC1]], double [[Z]], i32 2 + 
; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 16 + ; CHECK: store <3 x double> [[VEC2]], ptr [[PTR]] + %a2_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 16) + %a2 = load <3 x double>, ptr addrspace(2) %a2_gep, align 32 + %a2.i = getelementptr inbounds nuw i8, ptr %dst, i32 16 + store <3 x double> %a2, ptr %a2.i, align 8 + + ;; a3 + ; + ; CHECK: [[LOAD:%.*]] = call { half, half, half, half, half, half, half, half } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 2) + ; CHECK: [[X:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 4 + ; CHECK: [[Y:%.*]] = extractvalue { half, half, half, half, half, half, half, half } [[LOAD]], 5 + ; CHECK: [[VEC0:%.*]] = insertelement <2 x half> poison, half [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <2 x half> [[VEC0]], half [[Y]], i32 1 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 40 + ; CHECK: store <2 x half> [[VEC1]], ptr [[PTR]] + %a3_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 40) + %a3 = load <2 x half>, ptr addrspace(2) %a3_gep, align 4 + %a3.i = getelementptr inbounds nuw i8, ptr %dst, i32 40 + store <2 x half> %a3, ptr %a3.i, align 2 + + ;; a4 + ; + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 3) + ; CHECK: [[X:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i64, i64 } [[LOAD]], 1 + ; CHECK: [[LOAD:%.*]] = call { i64, i64 } @llvm.dx.resource.load.cbufferrow.2.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 4) + ; CHECK: [[Z:%.*]] = extractvalue { i64, i64 } [[LOAD]], 0 + ; CHECK: [[VEC0:%.*]] = insertelement <3 x i64> poison, i64 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <3 x i64> [[VEC0]], i64 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <3 x i64> [[VEC1]], i64 [[Z]], i32 2 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 48 + ; CHECK: store <3 x i64> [[VEC2]], ptr [[PTR]] + %a4_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 48) + %a4 = load <3 x i64>, ptr addrspace(2) %a4_gep, align 32 + %a4.i = getelementptr inbounds nuw i8, ptr %dst, i32 48 + store <3 x i64> %a4, ptr %a4.i, align 8 + + ;; a5 + ; + ; CHECK: [[LOAD:%.*]] = call { i32, i32, i32, i32 } @llvm.dx.resource.load.cbufferrow.4.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 5) + ; CHECK: [[X:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 0 + ; CHECK: [[Y:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 1 + ; CHECK: [[Z:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 2 + ; CHECK: [[A:%.*]] = extractvalue { i32, i32, i32, i32 } [[LOAD]], 3 + ; CHECK: [[VEC0:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0 + ; CHECK: [[VEC1:%.*]] = insertelement <4 x i32> [[VEC0]], i32 [[Y]], i32 1 + ; CHECK: [[VEC2:%.*]] = insertelement <4 x i32> [[VEC1]], i32 [[Z]], i32 2 + ; CHECK: [[VEC3:%.*]] = insertelement <4 x i32> [[VEC2]], i32 [[A]], i32 3 + ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 72 + ; CHECK: store <4 x i32> [[VEC3]], ptr [[PTR]] + %a5_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 80) + %a5 = load <4 x i32>, ptr addrspace(2) %a5_gep, align 16 + %a5.i = getelementptr inbounds nuw i8, ptr %dst, i32 
72
+  store <4 x i32> %a5, ptr %a5.i, align 4
+
+  ;; a6
+  ;
+  ; CHECK: [[LOAD:%.*]] = call { i16, i16, i16, i16, i16, i16, i16, i16 } @llvm.dx.resource.load.cbufferrow.8.{{.*}}(target("dx.CBuffer", %__cblayout_CB) [[CB]], i32 6)
+  ; CHECK: [[X:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 0
+  ; CHECK: [[Y:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 1
+  ; CHECK: [[Z:%.*]] = extractvalue { i16, i16, i16, i16, i16, i16, i16, i16 } [[LOAD]], 2
+  ; CHECK: [[VEC0:%.*]] = insertelement <3 x i16> poison, i16 [[X]], i32 0
+  ; CHECK: [[VEC1:%.*]] = insertelement <3 x i16> [[VEC0]], i16 [[Y]], i32 1
+  ; CHECK: [[VEC2:%.*]] = insertelement <3 x i16> [[VEC1]], i16 [[Z]], i32 2
+  ; CHECK: [[PTR:%.*]] = getelementptr inbounds nuw i8, ptr %dst, i32 88
+  ; CHECK: store <3 x i16> [[VEC2]], ptr [[PTR]]
+  %a6_gep = call ptr addrspace(2) @llvm.dx.resource.getpointer(target("dx.CBuffer", %__cblayout_CB) %CB.cb, i32 96)
+  %a6 = load <3 x i16>, ptr addrspace(2) %a6_gep, align 8
+  %a6.i = getelementptr inbounds nuw i8, ptr %dst, i32 88
+  store <3 x i16> %a6, ptr %a6.i, align 2
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/Hexagon/instrprof-custom.ll b/llvm/test/CodeGen/Hexagon/instrprof-custom.ll
index 620b2acc49520..1c1965d44541f 100644
--- a/llvm/test/CodeGen/Hexagon/instrprof-custom.ll
+++ b/llvm/test/CodeGen/Hexagon/instrprof-custom.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -mtriple=hexagon -relocation-model=pic < %s | FileCheck %s
-; RUN: llc -mtriple=hexagon < %s | FileCheck %s
+; RUN: llc -mtriple=hexagon --mattr=+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp < %s | FileCheck %s
 
 ; CHECK-LABEL: test1:
 ; CHECK: {{call my_instrprof_handler|r0 = #999}}
@@ -14,7 +14,4 @@ entry:
 }
 
 ; Function Attrs: inaccessiblememonly nofree nosync nounwind willreturn
-declare void @llvm.hexagon.instrprof.custom(ptr, i32) #1
-
-attributes #0 = { "target-features"="+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp,+hmxv68" }
-attributes #1 = { inaccessiblememonly nofree nosync nounwind willreturn }
+declare void @llvm.hexagon.instrprof.custom(ptr, i32)
diff --git a/llvm/test/CodeGen/Hexagon/late_instr.ll b/llvm/test/CodeGen/Hexagon/late_instr.ll
index 93e5a7dba4b3b..6bd1261ed83d5 100644
--- a/llvm/test/CodeGen/Hexagon/late_instr.ll
+++ b/llvm/test/CodeGen/Hexagon/late_instr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon -disable-hsdr < %s | FileCheck %s
+; RUN: llc -mtriple=hexagon -disable-hsdr -terminal-rule=0 < %s | FileCheck %s
 
 ; Check if instruction vandqrt.acc and its predecessor are scheduled in consecutive packets.
 ; CHECK: or(q{{[0-3]+}},q{{[0-3]+}})
diff --git a/llvm/test/CodeGen/Hexagon/swp-carried-1.ll b/llvm/test/CodeGen/Hexagon/swp-carried-1.ll
index 6993bd672c01a..f2beadfbfa64b 100644
--- a/llvm/test/CodeGen/Hexagon/swp-carried-1.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-carried-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-setupcost-depth-limit=1 -disable-cgp-delete-phis < %s -pipeliner-experimental-cg=true | FileCheck %s
+; RUN: llc -mtriple=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-setupcost-depth-limit=1 -disable-cgp-delete-phis < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s
 
 ; Test that we generate the correct code when a loop-carried value
 ; is scheduled one stage earlier than its use. The code in
diff --git a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
index 006a8b6bfc94a..69b89a680ff5a 100644
--- a/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-conv3x3-nested.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
+; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s
 
 ; This version of the conv3x3 test has both loops. This test checks that the
 ; inner loop has 14 packets.
diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi11.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi11.ll
index d1b9c51c45a2d..0466b6df46142 100644
--- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi11.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi11.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon-unknown-elf -mcpu=hexagonv55 -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s
+; RUN: llc -mtriple=hexagon-unknown-elf -mcpu=hexagonv55 -hexagon-initial-cfg-cleanup=0 -terminal-rule=0 < %s | FileCheck %s
 
 ; Test that the pipeliner correctly generates the operands in the
 ; epilog.
diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll
index ba479b696f16c..c6631bd9dc16d 100644
--- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi12.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon -hexagon-initial-cfg-cleanup=0 -pipeliner-experimental-cg=true -disable-cgp-delete-phis < %s | FileCheck %s
+; RUN: llc -mtriple=hexagon -hexagon-initial-cfg-cleanup=0 -pipeliner-experimental-cg=true -disable-cgp-delete-phis -terminal-rule=0 < %s | FileCheck %s
 
 ; Test epilogue generation when reading loop-carried dependency from a previous
 ; stage. The first epilogue should read the value from iteration N-1 of the kernel.
diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll
index 96a38939dc50e..d90e7c4cde1ca 100644
--- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi7.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon -O2 -enable-pipeliner -disable-block-placement=0 < %s | FileCheck %s
+; RUN: llc -mtriple=hexagon -O2 -enable-pipeliner -disable-block-placement=0 -terminal-rule=0 < %s | FileCheck %s
 
 ; For the Phis generated in the epilog, test that we generate the correct
 ; names for the values coming from the prolog stages. The test below
diff --git a/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll b/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll
index 6ca8e94200b7d..2a428ff941a71 100644
--- a/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-kernel-phi1.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon -enable-pipeliner-opt-size -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s
+; RUN: llc -mtriple=hexagon -enable-pipeliner-opt-size -hexagon-initial-cfg-cleanup=0 -terminal-rule=0 < %s -pipeliner-experimental-cg=true | FileCheck %s
 
 ; Test that we generate the correct names for the phis in the kernel for the
 ; incoming values. In this case, the loop contains a phi and has another phi
diff --git a/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll b/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll
index 42efe60b96d48..a0aeb80a5fa93 100644
--- a/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-matmul-bitext.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon -mcpu=hexagonv60 -enable-pipeliner < %s | FileCheck %s
+; RUN: llc -mtriple=hexagon -mcpu=hexagonv60 -enable-pipeliner -terminal-rule=0 < %s | FileCheck %s
 
 ; From coremark. Test that we pipeline the matrix multiplication bitextract
 ; function. The pipelined code should have two packets.
diff --git a/llvm/test/CodeGen/Hexagon/swp-order-copies.ll b/llvm/test/CodeGen/Hexagon/swp-order-copies.ll
index 1c9cc4a1cf9d8..bbaa8cd635f3e 100644
--- a/llvm/test/CodeGen/Hexagon/swp-order-copies.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-order-copies.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
+; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s
 
 ; Test that the instruction ordering code in the pipeliner fixes up dependences
 ; between post-increment register definitions and uses so that the register
diff --git a/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll b/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll
index 5f1780fce39d2..38893de0b0829 100644
--- a/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-order-deps7.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
+; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s
 
 ; Test that the pipeliner doesn't cause an assert and correctly pipelines the
 ; loop.
diff --git a/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll b/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll
index 6c8b0638ae5d1..5189812d522c6 100644
--- a/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-reuse-phi-6.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true | FileCheck %s
+; RUN: llc -mtriple=hexagon < %s -pipeliner-experimental-cg=true -terminal-rule=0 | FileCheck %s
 
 ; Test that the pipeliner generates correct code when attempting to reuse
 ; an existing phi. 
This test case contains a phi that references another diff --git a/llvm/test/CodeGen/RISCV/branch-on-zero.ll b/llvm/test/CodeGen/RISCV/branch-on-zero.ll index 02aeebdeb3775..2aec92eca145f 100644 --- a/llvm/test/CodeGen/RISCV/branch-on-zero.ll +++ b/llvm/test/CodeGen/RISCV/branch-on-zero.ll @@ -127,13 +127,11 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) { ; RV32-NEXT: .LBB3_2: # %while.body ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 ; RV32-NEXT: lw a3, 0(a1) -; RV32-NEXT: addi a4, a1, 4 +; RV32-NEXT: addi a1, a1, 4 ; RV32-NEXT: slli a3, a3, 1 -; RV32-NEXT: addi a1, a0, 4 ; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: mv a1, a4 -; RV32-NEXT: bne a4, a2, .LBB3_2 +; RV32-NEXT: addi a0, a0, 4 +; RV32-NEXT: bne a1, a2, .LBB3_2 ; RV32-NEXT: .LBB3_3: # %while.end ; RV32-NEXT: li a0, 0 ; RV32-NEXT: ret @@ -151,13 +149,11 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) { ; RV64-NEXT: .LBB3_2: # %while.body ; RV64-NEXT: # =>This Inner Loop Header: Depth=1 ; RV64-NEXT: lw a3, 0(a1) -; RV64-NEXT: addi a4, a1, 4 +; RV64-NEXT: addi a1, a1, 4 ; RV64-NEXT: slli a3, a3, 1 -; RV64-NEXT: addi a1, a0, 4 ; RV64-NEXT: sw a3, 0(a0) -; RV64-NEXT: mv a0, a1 -; RV64-NEXT: mv a1, a4 -; RV64-NEXT: bne a4, a2, .LBB3_2 +; RV64-NEXT: addi a0, a0, 4 +; RV64-NEXT: bne a1, a2, .LBB3_2 ; RV64-NEXT: .LBB3_3: # %while.end ; RV64-NEXT: li a0, 0 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/machine-pipeliner.ll b/llvm/test/CodeGen/RISCV/machine-pipeliner.ll index d250098576687..a2a7da7e2d6ef 100644 --- a/llvm/test/CodeGen/RISCV/machine-pipeliner.ll +++ b/llvm/test/CodeGen/RISCV/machine-pipeliner.ll @@ -54,37 +54,37 @@ define void @test_pipelined_1(ptr noalias %in, ptr noalias %out, i32 signext %cn ; CHECK-PIPELINED: # %bb.0: # %entry ; CHECK-PIPELINED-NEXT: blez a2, .LBB1_6 ; CHECK-PIPELINED-NEXT: # %bb.1: # %for.body.preheader -; CHECK-PIPELINED-NEXT: lw a4, 0(a1) +; CHECK-PIPELINED-NEXT: lw a7, 0(a1) ; CHECK-PIPELINED-NEXT: addi a2, a2, -1 +; CHECK-PIPELINED-NEXT: addi a3, a0, 4 +; CHECK-PIPELINED-NEXT: addi a5, a1, 4 ; CHECK-PIPELINED-NEXT: sh2add.uw a6, a2, a1 -; CHECK-PIPELINED-NEXT: addi a2, a0, 4 -; CHECK-PIPELINED-NEXT: addi a1, a1, 4 ; CHECK-PIPELINED-NEXT: addi a6, a6, 4 -; CHECK-PIPELINED-NEXT: beq a1, a6, .LBB1_5 +; CHECK-PIPELINED-NEXT: beq a5, a6, .LBB1_5 ; CHECK-PIPELINED-NEXT: # %bb.2: # %for.body -; CHECK-PIPELINED-NEXT: lw a5, 0(a1) -; CHECK-PIPELINED-NEXT: addi a3, a2, 4 -; CHECK-PIPELINED-NEXT: addi a4, a4, 1 -; CHECK-PIPELINED-NEXT: addi a1, a1, 4 -; CHECK-PIPELINED-NEXT: beq a1, a6, .LBB1_4 +; CHECK-PIPELINED-NEXT: lw a1, 0(a5) +; CHECK-PIPELINED-NEXT: addi a4, a3, 4 +; CHECK-PIPELINED-NEXT: addi a5, a5, 4 +; CHECK-PIPELINED-NEXT: beq a5, a6, .LBB1_4 ; CHECK-PIPELINED-NEXT: .LBB1_3: # %for.body ; CHECK-PIPELINED-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-PIPELINED-NEXT: sw a4, 0(a0) -; CHECK-PIPELINED-NEXT: mv a4, a5 -; CHECK-PIPELINED-NEXT: lw a5, 0(a1) -; CHECK-PIPELINED-NEXT: mv a0, a2 -; CHECK-PIPELINED-NEXT: mv a2, a3 -; CHECK-PIPELINED-NEXT: addi a3, a3, 4 -; CHECK-PIPELINED-NEXT: addi a4, a4, 1 -; CHECK-PIPELINED-NEXT: addi a1, a1, 4 -; CHECK-PIPELINED-NEXT: bne a1, a6, .LBB1_3 +; CHECK-PIPELINED-NEXT: addi a2, a7, 1 +; CHECK-PIPELINED-NEXT: mv a7, a1 +; CHECK-PIPELINED-NEXT: lw a1, 0(a5) +; CHECK-PIPELINED-NEXT: sw a2, 0(a0) +; CHECK-PIPELINED-NEXT: mv a0, a3 +; CHECK-PIPELINED-NEXT: mv a3, a4 +; CHECK-PIPELINED-NEXT: addi a4, a4, 4 +; CHECK-PIPELINED-NEXT: addi a5, a5, 4 +; CHECK-PIPELINED-NEXT: 
bne a5, a6, .LBB1_3 ; CHECK-PIPELINED-NEXT: .LBB1_4: -; CHECK-PIPELINED-NEXT: sw a4, 0(a0) -; CHECK-PIPELINED-NEXT: mv a0, a2 -; CHECK-PIPELINED-NEXT: mv a4, a5 +; CHECK-PIPELINED-NEXT: addi a7, a7, 1 +; CHECK-PIPELINED-NEXT: sw a7, 0(a0) +; CHECK-PIPELINED-NEXT: mv a0, a3 +; CHECK-PIPELINED-NEXT: mv a7, a1 ; CHECK-PIPELINED-NEXT: .LBB1_5: -; CHECK-PIPELINED-NEXT: addi a4, a4, 1 -; CHECK-PIPELINED-NEXT: sw a4, 0(a0) +; CHECK-PIPELINED-NEXT: addi a7, a7, 1 +; CHECK-PIPELINED-NEXT: sw a7, 0(a0) ; CHECK-PIPELINED-NEXT: .LBB1_6: # %for.end ; CHECK-PIPELINED-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll new file mode 100644 index 0000000000000..46d5e9f9a538f --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll @@ -0,0 +1,515 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-p -enable-p-ext-codegen -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECK-RV32 %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-p -enable-p-ext-codegen -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECK-RV64 %s + +; Test basic add/sub operations for v2i16 +define void @test_padd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_padd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: padd.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %res = add <2 x i16> %a, %b + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: psub.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %res = sub <2 x i16> %a, %b + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test basic add/sub operations for v4i8 +define void @test_padd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_padd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: padd.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %res = add <4 x i8> %a, %b + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_psub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: psub.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %res = sub <4 x i8> %a, %b + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test saturating add operations for v2i16 +define void @test_psadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psadd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: psadd.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %res = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b) + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psaddu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: 
psaddu.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %res = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> %a, <2 x i16> %b) + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test saturating sub operations for v2i16 +define void @test_pssub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pssub.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %res = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %a, <2 x i16> %b) + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_pssubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssubu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pssubu.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %res = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %a, <2 x i16> %b) + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test saturating add operations for v4i8 +define void @test_psadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psadd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: psadd.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %res = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> %a, <4 x i8> %b) + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_psaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psaddu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: psaddu.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %res = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> %a, <4 x i8> %b) + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test saturating sub operations for v4i8 +define void @test_pssub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pssub.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %res = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %a, <4 x i8> %b) + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_pssubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssubu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pssubu.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %res = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %a, <4 x i8> %b) + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor signed operations for v2i16 +define void @test_paadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paadd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: paadd.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %ext.a = sext <2 x i16> %a to <2 x i32> + %ext.b = sext <2 x i16> %b to <2 x i32> + %add = add nsw <2 x i32> %ext.a, %ext.b + 
%shift = ashr <2 x i32> %add, <i32 1, i32 1> + %res = trunc <2 x i32> %shift to <2 x i16> + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor unsigned operations for v2i16 +define void @test_paaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paaddu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: paaddu.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %and = and <2 x i16> %a, %b + %xor = xor <2 x i16> %a, %b + %shift = lshr <2 x i16> %xor, <i16 1, i16 1> + %res = add <2 x i16> %and, %shift + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor signed operations for v4i8 +define void @test_paadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paadd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: paadd.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %ext.a = sext <4 x i8> %a to <4 x i16> + %ext.b = sext <4 x i8> %b to <4 x i16> + %add = add nsw <4 x i16> %ext.a, %ext.b + %shift = ashr <4 x i16> %add, <i16 1, i16 1, i16 1, i16 1> + %res = trunc <4 x i16> %shift to <4 x i8> + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor unsigned operations for v4i8 +define void @test_paaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paaddu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: paaddu.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %and = and <4 x i8> %a, %b + %xor = xor <4 x i8> %a, %b + %shift = lshr <4 x i8> %xor, <i8 1, i8 1, i8 1, i8 1> + %res = add <4 x i8> %and, %shift + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference signed for v2i16 +define void @test_pdif_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdif_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pdif.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %min = call <2 x i16> @llvm.smin.v2i16(<2 x i16> %a, <2 x i16> %b) + %max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %a, <2 x i16> %b) + %res = sub <2 x i16> %max, %min + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference unsigned for v2i16 +define void @test_pdifu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdifu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pdifu.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %min = call <2 x i16> @llvm.umin.v2i16(<2 x i16> %a, <2 x i16> %b) + %max = call <2 x i16> @llvm.umax.v2i16(<2 x i16> %a, <2 x i16> %b) + %res = sub <2 x i16> %max, %min + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference signed for v4i8 +define void @test_pdif_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdif_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pdif.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %min = call <4 x i8> @llvm.smin.v4i8(<4 x i8> %a, <4 x i8> %b) + %max = call <4 x i8> @llvm.smax.v4i8(<4 x i8> %a, <4 x i8> %b) + %res = sub <4 x i8>
%max, %min + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference unsigned for v4i8 +define void @test_pdifu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdifu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pdifu.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %min = call <4 x i8> @llvm.umin.v4i8(<4 x i8> %a, <4 x i8> %b) + %max = call <4 x i8> @llvm.umax.v4i8(<4 x i8> %a, <4 x i8> %b) + %res = sub <4 x i8> %max, %min + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction signed for v2i16 +; pasub pattern: (a - b) arithmetic shift right 1 +define void @test_pasub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pasub.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %a_ext = sext <2 x i16> %a to <2 x i32> + %b_ext = sext <2 x i16> %b to <2 x i32> + %sub = sub <2 x i32> %a_ext, %b_ext + %res = ashr <2 x i32> %sub, <i32 1, i32 1> + %res_trunc = trunc <2 x i32> %res to <2 x i16> + store <2 x i16> %res_trunc, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction unsigned for v2i16 +; pasubu pattern: (a - b) logical shift right 1 +define void @test_pasubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasubu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pasubu.h a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %b = load <2 x i16>, ptr %b_ptr + %a_ext = zext <2 x i16> %a to <2 x i32> + %b_ext = zext <2 x i16> %b to <2 x i32> + %sub = sub <2 x i32> %a_ext, %b_ext + %res = lshr <2 x i32> %sub, <i32 1, i32 1> + %res_trunc = trunc <2 x i32> %res to <2 x i16> + store <2 x i16> %res_trunc, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction signed for v4i8 +define void @test_pasub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pasub.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %a_ext = sext <4 x i8> %a to <4 x i16> + %b_ext = sext <4 x i8> %b to <4 x i16> + %sub = sub <4 x i16> %a_ext, %b_ext + %res = ashr <4 x i16> %sub, <i16 1, i16 1, i16 1, i16 1> + %res_trunc = trunc <4 x i16> %res to <4 x i8> + store <4 x i8> %res_trunc, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction unsigned for v4i8 +define void @test_pasubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasubu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: lw a2, 0(a2) +; CHECK-NEXT: pasubu.b a1, a1, a2 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %b = load <4 x i8>, ptr %b_ptr + %a_ext = zext <4 x i8> %a to <4 x i16> + %b_ext = zext <4 x i8> %b to <4 x i16> + %sub = sub <4 x i16> %a_ext, %b_ext + %res = lshr <4 x i16> %sub, <i16 1, i16 1, i16 1, i16 1> + %res_trunc = trunc <4 x i16> %res to <4 x i8> + store <4 x i8> %res_trunc, ptr %ret_ptr + ret void +} + +; Test PLI (pack load immediate) for v2i16 +define void @test_pli_h(ptr %ret_ptr) { +; CHECK-LABEL: test_pli_h: +; CHECK: # %bb.0: +; CHECK-NEXT: pli.h a1, 42 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %res = add <2 x i16> <i16 21, i16 21>, <i16 21, i16 21> + store <2 x i16> %res, ptr %ret_ptr + ret void
+} + +define void @test_pli_h_negative(ptr %ret_ptr) { +; CHECK-LABEL: test_pli_h_negative: +; CHECK: # %bb.0: +; CHECK-NEXT: pli.h a1, -5 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %res = add <2 x i16> <i16 -2, i16 -2>, <i16 -3, i16 -3> + store <2 x i16> %res, ptr %ret_ptr + ret void +} + +; Test PLI for v4i8 with unsigned immediate +define void @test_pli_b(ptr %ret_ptr) { +; CHECK-LABEL: test_pli_b: +; CHECK: # %bb.0: +; CHECK-NEXT: pli.b a1, 32 +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %res = add <4 x i8> <i8 16, i8 16, i8 16, i8 16>, <i8 16, i8 16, i8 16, i8 16> + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_pli_b_negative(ptr %ret_ptr) { +; CHECK-RV32-LABEL: test_pli_b_negative: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: pli.b a1, -2 +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64-LABEL: test_pli_b_negative: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: pli.h a1, -258 +; CHECK-RV64-NEXT: sw a1, 0(a0) +; CHECK-RV64-NEXT: ret + %res = add <4 x i8> <i8 -1, i8 -1, i8 -1, i8 -1>, <i8 -1, i8 -1, i8 -1, i8 -1> + store <4 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_extract_vector_16(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_extract_vector_16: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: sh a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i16>, ptr %a_ptr + %extracted = extractelement <2 x i16> %a, i32 0 + store i16 %extracted, ptr %ret_ptr + ret void +} + +define void @test_extract_vector_8(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_extract_vector_8: +; CHECK: # %bb.0: +; CHECK-NEXT: lw a1, 0(a1) +; CHECK-NEXT: sb a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %a_ptr + %extracted = extractelement <4 x i8> %a, i32 0 + store i8 %extracted, ptr %ret_ptr + ret void +} + +; Intrinsic declarations +declare <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16>, <2 x i16>) +declare <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16>, <2 x i16>) +declare <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16>, <2 x i16>) +declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>) +declare <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.usub.sat.v4i8(<4 x i8>, <4 x i8>) +declare <2 x i16> @llvm.smin.v2i16(<2 x i16>, <2 x i16>) +declare <2 x i16> @llvm.smax.v2i16(<2 x i16>, <2 x i16>) +declare <2 x i16> @llvm.umin.v2i16(<2 x i16>, <2 x i16>) +declare <2 x i16> @llvm.umax.v2i16(<2 x i16>, <2 x i16>) +declare <4 x i8> @llvm.smin.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.smax.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.umin.v4i8(<4 x i8>, <4 x i8>) +declare <4 x i8> @llvm.umax.v4i8(<4 x i8>, <4 x i8>) diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll new file mode 100644 index 0000000000000..000a95fb6e0f8 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll @@ -0,0 +1,514 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+experimental-p -enable-p-ext-codegen -verify-machineinstrs < %s | FileCheck %s + +; Test basic add/sub operations for v4i16 +define void @test_padd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_padd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: padd.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = add <4 x i16> %a, %b + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +;
CHECK-LABEL: test_psub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psub.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = sub <4 x i16> %a, %b + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test basic add/sub operations for v8i8 +define void @test_padd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_padd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: padd.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = add <8 x i8> %a, %b + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_psub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psub.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = sub <8 x i8> %a, %b + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test saturating add operations for v4i16 +define void @test_psadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psadd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psadd.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_psaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psaddu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psaddu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test saturating sub operations for v4i16 +define void @test_pssub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssub.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +define void @test_pssubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssubu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssubu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %res = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %a, <4 x i16> %b) + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test saturating add operations for v8i8 +define void @test_psadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psadd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psadd.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +define void 
@test_psaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_psaddu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: psaddu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test saturating sub operations for v8i8 +define void @test_pssub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssub.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +define void @test_pssubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pssubu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pssubu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %res = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %a, <8 x i8> %b) + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor signed operations for v4i16 +; avgfloors pattern: (a + b) arithmetic shift right 1 +define void @test_paadd_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paadd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: paadd.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %ext.a = sext <4 x i16> %a to <4 x i32> + %ext.b = sext <4 x i16> %b to <4 x i32> + %add = add nsw <4 x i32> %ext.a, %ext.b + %shift = ashr <4 x i32> %add, <i32 1, i32 1, i32 1, i32 1> + %res = trunc <4 x i32> %shift to <4 x i16> + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor unsigned operations for v4i16 +; avgflooru pattern: (a & b) + ((a ^ b) >> 1) +define void @test_paaddu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paaddu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: paaddu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %and = and <4 x i16> %a, %b + %xor = xor <4 x i16> %a, %b + %shift = lshr <4 x i16> %xor, <i16 1, i16 1, i16 1, i16 1> + %res = add <4 x i16> %and, %shift + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor signed operations for v8i8 +define void @test_paadd_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paadd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: paadd.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %ext.a = sext <8 x i8> %a to <8 x i16> + %ext.b = sext <8 x i8> %b to <8 x i16> + %add = add nsw <8 x i16> %ext.a, %ext.b + %shift = ashr <8 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %res = trunc <8 x i16> %shift to <8 x i8> + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor unsigned operations for v8i8 +define void @test_paaddu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_paaddu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: paaddu.b a1, a1, a2 +; CHECK-NEXT: sd a1,
0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %and = and <8 x i8> %a, %b + %xor = xor <8 x i8> %a, %b + %shift = lshr <8 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> + %res = add <8 x i8> %and, %shift + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference signed for v4i16 +; abds pattern: sub(smax(a,b), smin(a,b)) +define void @test_pdif_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdif_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pdif.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %min = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %a, <4 x i16> %b) + %max = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %a, <4 x i16> %b) + %res = sub <4 x i16> %max, %min + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference unsigned for v4i16 +; abdu pattern: sub(umax(a,b), umin(a,b)) +define void @test_pdifu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdifu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pdifu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %min = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %a, <4 x i16> %b) + %max = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %a, <4 x i16> %b) + %res = sub <4 x i16> %max, %min + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference signed for v8i8 +define void @test_pdif_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdif_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pdif.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %min = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %a, <8 x i8> %b) + %max = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %a, <8 x i8> %b) + %res = sub <8 x i8> %max, %min + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test absolute difference unsigned for v8i8 +define void @test_pdifu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pdifu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pdifu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %min = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %a, <8 x i8> %b) + %max = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %a, <8 x i8> %b) + %res = sub <8 x i8> %max, %min + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction signed for v4i16 +; pasub pattern: (a - b) arithmetic shift right 1 +define void @test_pasub_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasub_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pasub.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %a_ext = sext <4 x i16> %a to <4 x i32> + %b_ext = sext <4 x i16> %b to <4 x i32> + %sub = sub <4 x i32> %a_ext, %b_ext + %res = ashr <4 x i32> %sub, <i32 1, i32 1, i32 1, i32 1> + %res_trunc = trunc <4 x i32> %res to <4 x i16> + store <4 x i16> %res_trunc, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction unsigned for v4i16 +; pasubu pattern: (a - b) logical shift right 1 +define void @test_pasubu_h(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +;
CHECK-LABEL: test_pasubu_h: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pasubu.h a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %b = load <4 x i16>, ptr %b_ptr + %a_ext = zext <4 x i16> %a to <4 x i32> + %b_ext = zext <4 x i16> %b to <4 x i32> + %sub = sub <4 x i32> %a_ext, %b_ext + %res = lshr <4 x i32> %sub, <i32 1, i32 1, i32 1, i32 1> + %res_trunc = trunc <4 x i32> %res to <4 x i16> + store <4 x i16> %res_trunc, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction signed for v8i8 +define void @test_pasub_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasub_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pasub.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %a_ext = sext <8 x i8> %a to <8 x i16> + %b_ext = sext <8 x i8> %b to <8 x i16> + %sub = sub <8 x i16> %a_ext, %b_ext + %res = ashr <8 x i16> %sub, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %res_trunc = trunc <8 x i16> %res to <8 x i8> + store <8 x i8> %res_trunc, ptr %ret_ptr + ret void +} + +; Test averaging floor subtraction unsigned for v8i8 +define void @test_pasubu_b(ptr %ret_ptr, ptr %a_ptr, ptr %b_ptr) { +; CHECK-LABEL: test_pasubu_b: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: ld a2, 0(a2) +; CHECK-NEXT: pasubu.b a1, a1, a2 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %b = load <8 x i8>, ptr %b_ptr + %a_ext = zext <8 x i8> %a to <8 x i16> + %b_ext = zext <8 x i8> %b to <8 x i16> + %sub = sub <8 x i16> %a_ext, %b_ext + %res = lshr <8 x i16> %sub, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %res_trunc = trunc <8 x i16> %res to <8 x i8> + store <8 x i8> %res_trunc, ptr %ret_ptr + ret void +} + +; Test PLI (pack load immediate) for v4i16 +define void @test_pli_h(ptr %ret_ptr) { +; CHECK-LABEL: test_pli_h: +; CHECK: # %bb.0: +; CHECK-NEXT: pli.h a1, 100 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %res = add <4 x i16> <i16 50, i16 50, i16 50, i16 50>, <i16 50, i16 50, i16 50, i16 50> + store <4 x i16> %res, ptr %ret_ptr + ret void +} + +; Test PLI for v8i8 with unsigned immediate +define void @test_pli_b(ptr %ret_ptr) { +; CHECK-LABEL: test_pli_b: +; CHECK: # %bb.0: +; CHECK-NEXT: pli.b a1, 64 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %res = add <8 x i8> <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32>, <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32> + store <8 x i8> %res, ptr %ret_ptr + ret void +} + +; Test PLI for v2i32 with signed immediate +define void @test_pli_w(ptr %ret_ptr) { +; CHECK-LABEL: test_pli_w: +; CHECK: # %bb.0: +; CHECK-NEXT: pli.w a1, -256 +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: ret + %res = add <2 x i32> <i32 -128, i32 -128>, <i32 -128, i32 -128> + store <2 x i32> %res, ptr %ret_ptr + ret void +} + +define void @test_extract_vector_16(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_extract_vector_16: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: sh a1, 0(a0) +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %a_ptr + %extracted = extractelement <4 x i16> %a, i32 0 + store i16 %extracted, ptr %ret_ptr + ret void +} + +define void @test_extract_vector_8(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_extract_vector_8: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: sb a1, 0(a0) +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %a_ptr + %extracted = extractelement <8 x i8> %a, i32 0 + store i8 %extracted, ptr %ret_ptr + ret void +} + +define void @test_extract_vector_32(ptr %ret_ptr, ptr %a_ptr) { +; CHECK-LABEL: test_extract_vector_32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld a1, 0(a1) +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ret + %a = load <2 x i32>, ptr %a_ptr + %extracted =
extractelement <2 x i32> %a, i32 0 + store i32 %extracted, ptr %ret_ptr + ret void +} + +; Intrinsic declarations +declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.usub.sat.v8i8(<8 x i8>, <8 x i8>) +declare <4 x i16> @llvm.smin.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.smax.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.umin.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.umax.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i8> @llvm.smin.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.smax.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.umin.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.umax.v8i8(<8 x i8>, <8 x i8>) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll index 9c6d77dde1b5c..c3fe6b335d3da 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll @@ -44,9 +44,8 @@ define <4 x i64> @m2_splat_with_tail(<4 x i64> %v1) vscale_range(2,2) { ; CHECK-LABEL: m2_splat_with_tail: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vrgather.vi v10, v8, 0 -; CHECK-NEXT: vmv1r.v v11, v9 -; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vrgather.vi v8, v10, 0 ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> ret <4 x i64> %res @@ -99,9 +98,8 @@ define <4 x i64> @m2_splat_into_identity(<4 x i64> %v1) vscale_range(2,2) { ; CHECK-LABEL: m2_splat_into_identity: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vrgather.vi v10, v8, 0 -; CHECK-NEXT: vmv1r.v v11, v9 -; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vrgather.vi v8, v10, 0 ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> ret <4 x i64> %res diff --git a/llvm/test/CodeGen/RISCV/rvv/machine-combiner-subreg-verifier-error.mir b/llvm/test/CodeGen/RISCV/rvv/machine-combiner-subreg-verifier-error.mir new file mode 100644 index 0000000000000..76dfd4e746bea --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/machine-combiner-subreg-verifier-error.mir @@ -0,0 +1,39 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs -run-pass=machine-combiner -o - %s | FileCheck %s + +# Make sure the verifier doesn't fail due to dropping subregister +# uses. 
+ +--- +name: machine_combiner_subreg_verifier_error +tracksRegLiveness: true +isSSA: true +body: | + bb.0: + liveins: $v8m4, $v12m4 + + ; CHECK-LABEL: name: machine_combiner_subreg_verifier_error + ; CHECK: liveins: $v8m4, $v12m4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vrm4 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:gprnox0 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vrm8 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vr = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vrm2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vr = IMPLICIT_DEF + ; CHECK-NEXT: [[PseudoVSLIDEDOWN_VI_M8_:%[0-9]+]]:vrm8 = PseudoVSLIDEDOWN_VI_M8 $noreg, [[DEF2]], 26, 2, 5 /* e32 */, 3 /* ta, ma */ + ; CHECK-NEXT: [[PseudoVADD_VV_MF2_:%[0-9]+]]:vr = PseudoVADD_VV_MF2 $noreg, [[DEF2]].sub_vrm1_0, killed [[DEF3]], 2, 5 /* e32 */, 1 /* ta, mu */ + ; CHECK-NEXT: [[PseudoVADD_VV_MF2_1:%[0-9]+]]:vr = PseudoVADD_VV_MF2 $noreg, [[PseudoVSLIDEDOWN_VI_M8_]].sub_vrm1_0, killed [[PseudoVADD_VV_MF2_]], 2, 5 /* e32 */, 1 /* ta, mu */ + ; CHECK-NEXT: PseudoRET implicit $v8 + %0:vrm4 = IMPLICIT_DEF + %1:gprnox0 = IMPLICIT_DEF + %2:vrm8 = IMPLICIT_DEF + %3:vr = IMPLICIT_DEF + %4:vrm2 = IMPLICIT_DEF + %5:vr = IMPLICIT_DEF + %6:vrm8 = PseudoVSLIDEDOWN_VI_M8 $noreg, %2, 26, 2, 5 /* e32 */, 3 /* ta, ma */ + %7:vr = PseudoVADD_VV_MF2 $noreg, %6.sub_vrm1_0, %2.sub_vrm1_0, 2, 5 /* e32 */, 1 /* ta, mu */ + %8:vr = PseudoVADD_VV_MF2 $noreg, killed %7, killed %3, 2, 5 /* e32 */, 1 /* ta, mu */ + PseudoRET implicit $v8 + +... diff --git a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll index ab9849631663c..a4c793b49d54a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll @@ -36,7 +36,7 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB0_2 Depth 2 @@ -53,9 +53,9 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, This Inner Loop Header: Depth=5 -; CHECK-NEXT: addi a5, a1, 4 -; CHECK-NEXT: add a4, s8, a1 -; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: add a4, a5, a1 +; CHECK-NEXT: add a3, s6, a1 +; CHECK-NEXT: addi a1, a1, 4 ; CHECK-NEXT: vse32.v v8, (a4), v0.t -; CHECK-NEXT: vse32.v v8, (a1), v0.t -; CHECK-NEXT: mv a1, a5 -; CHECK-NEXT: bne a5, s0, .LBB0_5 +; CHECK-NEXT: vse32.v v8, (a3), v0.t +; CHECK-NEXT: bne a1, s0, .LBB0_5 ; CHECK-NEXT: # %bb.6: # %for.cond.cleanup15.i ; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=4 ; CHECK-NEXT: addi s1, s1, 4 -; CHECK-NEXT: addi s8, s8, 4 +; CHECK-NEXT: addi a5, a5, 4 ; CHECK-NEXT: addi ra, ra, 4 -; CHECK-NEXT: addi a3, a3, 4 +; CHECK-NEXT: addi s6, s6, 4 ; CHECK-NEXT: andi s10, a0, 1 ; CHECK-NEXT: addi s11, s11, 4 ; CHECK-NEXT: beqz s10, .LBB0_4 ; CHECK-NEXT: # %bb.7: # %for.cond.cleanup11.i ; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=3 ; CHECK-NEXT: addi s9, s9, 4 -; CHECK-NEXT: addi s3, s3, 4 +; CHECK-NEXT: addi s8, s8, 4 ; CHECK-NEXT: addi s7, s7, 4 -; CHECK-NEXT: addi s6, s6, 4 +; CHECK-NEXT: addi t5, t5, 4 ; CHECK-NEXT: andi a1, a2, 1 ; CHECK-NEXT: addi s5, s5, 4 ; CHECK-NEXT: beqz a1, .LBB0_3 ; CHECK-NEXT: # %bb.8: # %for.cond.cleanup7.i ; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=2 ; CHECK-NEXT: addi s4, s4, 4 -; CHECK-NEXT: addi a7, a7, 4 +; CHECK-NEXT: addi s3, s3, 4 ; CHECK-NEXT: addi t6, t6, 4 -; CHECK-NEXT: addi t5, t5, 4 +; CHECK-NEXT: addi t2, t2, 4 ; CHECK-NEXT: addi t4, t4, 4 ; CHECK-NEXT: beqz t3, .LBB0_2 ; CHECK-NEXT: # %bb.9: # %for.cond.cleanup3.i ; CHECK-NEXT: # in Loop: Header=BB0_1 
Depth=1 ; CHECK-NEXT: addi a6, a6, 4 -; CHECK-NEXT: addi s2, s2, 4 +; CHECK-NEXT: addi a7, a7, 4 ; CHECK-NEXT: addi t0, t0, 4 -; CHECK-NEXT: addi t2, t2, 4 +; CHECK-NEXT: addi s2, s2, 4 ; CHECK-NEXT: addi t1, t1, 4 ; CHECK-NEXT: beqz a1, .LBB0_1 ; CHECK-NEXT: # %bb.10: # %l.exit diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll index f295bd8d74df3..386c736128794 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll @@ -2258,18 +2258,18 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-RV32-NEXT: vsetvli a7, zero, e32, m2, ta, ma ; CHECK-RV32-NEXT: .LBB98_3: # %vector.body ; CHECK-RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-RV32-NEXT: slli a7, a6, 2 -; CHECK-RV32-NEXT: add t0, a6, a4 -; CHECK-RV32-NEXT: add a7, a0, a7 -; CHECK-RV32-NEXT: vl2re32.v v8, (a7) -; CHECK-RV32-NEXT: sltu a6, t0, a6 -; CHECK-RV32-NEXT: add a5, a5, a6 -; CHECK-RV32-NEXT: xor a6, t0, a3 +; CHECK-RV32-NEXT: mv a7, a6 +; CHECK-RV32-NEXT: slli t0, a6, 2 +; CHECK-RV32-NEXT: add a6, a6, a4 +; CHECK-RV32-NEXT: add t0, a0, t0 +; CHECK-RV32-NEXT: vl2re32.v v8, (t0) +; CHECK-RV32-NEXT: sltu a7, a6, a7 +; CHECK-RV32-NEXT: add a5, a5, a7 +; CHECK-RV32-NEXT: xor a7, a6, a3 ; CHECK-RV32-NEXT: vand.vx v8, v8, a1 -; CHECK-RV32-NEXT: or t1, a6, a5 -; CHECK-RV32-NEXT: vs2r.v v8, (a7) -; CHECK-RV32-NEXT: mv a6, t0 -; CHECK-RV32-NEXT: bnez t1, .LBB98_3 +; CHECK-RV32-NEXT: or a7, a7, a5 +; CHECK-RV32-NEXT: vs2r.v v8, (t0) +; CHECK-RV32-NEXT: bnez a7, .LBB98_3 ; CHECK-RV32-NEXT: # %bb.4: # %middle.block ; CHECK-RV32-NEXT: bnez a3, .LBB98_6 ; CHECK-RV32-NEXT: .LBB98_5: # %for.body @@ -2350,18 +2350,18 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-ZVKB-NOZBB32-NEXT: vsetvli a7, zero, e32, m2, ta, ma ; CHECK-ZVKB-NOZBB32-NEXT: .LBB98_3: # %vector.body ; CHECK-ZVKB-NOZBB32-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-ZVKB-NOZBB32-NEXT: slli a7, a6, 2 -; CHECK-ZVKB-NOZBB32-NEXT: add t0, a6, a4 -; CHECK-ZVKB-NOZBB32-NEXT: add a7, a0, a7 -; CHECK-ZVKB-NOZBB32-NEXT: vl2re32.v v8, (a7) -; CHECK-ZVKB-NOZBB32-NEXT: sltu a6, t0, a6 -; CHECK-ZVKB-NOZBB32-NEXT: add a5, a5, a6 -; CHECK-ZVKB-NOZBB32-NEXT: xor a6, t0, a3 +; CHECK-ZVKB-NOZBB32-NEXT: mv a7, a6 +; CHECK-ZVKB-NOZBB32-NEXT: slli t0, a6, 2 +; CHECK-ZVKB-NOZBB32-NEXT: add a6, a6, a4 +; CHECK-ZVKB-NOZBB32-NEXT: add t0, a0, t0 +; CHECK-ZVKB-NOZBB32-NEXT: vl2re32.v v8, (t0) +; CHECK-ZVKB-NOZBB32-NEXT: sltu a7, a6, a7 +; CHECK-ZVKB-NOZBB32-NEXT: add a5, a5, a7 +; CHECK-ZVKB-NOZBB32-NEXT: xor a7, a6, a3 ; CHECK-ZVKB-NOZBB32-NEXT: vandn.vx v8, v8, a1 -; CHECK-ZVKB-NOZBB32-NEXT: or t1, a6, a5 -; CHECK-ZVKB-NOZBB32-NEXT: vs2r.v v8, (a7) -; CHECK-ZVKB-NOZBB32-NEXT: mv a6, t0 -; CHECK-ZVKB-NOZBB32-NEXT: bnez t1, .LBB98_3 +; CHECK-ZVKB-NOZBB32-NEXT: or a7, a7, a5 +; CHECK-ZVKB-NOZBB32-NEXT: vs2r.v v8, (t0) +; CHECK-ZVKB-NOZBB32-NEXT: bnez a7, .LBB98_3 ; CHECK-ZVKB-NOZBB32-NEXT: # %bb.4: # %middle.block ; CHECK-ZVKB-NOZBB32-NEXT: bnez a3, .LBB98_7 ; CHECK-ZVKB-NOZBB32-NEXT: .LBB98_5: # %for.body.preheader @@ -2444,18 +2444,18 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) { ; CHECK-ZVKB-ZBB32-NEXT: vsetvli a7, zero, e32, m2, ta, ma ; CHECK-ZVKB-ZBB32-NEXT: .LBB98_3: # %vector.body ; CHECK-ZVKB-ZBB32-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-ZVKB-ZBB32-NEXT: slli a7, a6, 2 -; CHECK-ZVKB-ZBB32-NEXT: add t0, a6, a4 -; CHECK-ZVKB-ZBB32-NEXT: add a7, a0, a7 -; 
CHECK-ZVKB-ZBB32-NEXT: vl2re32.v v8, (a7) -; CHECK-ZVKB-ZBB32-NEXT: sltu a6, t0, a6 -; CHECK-ZVKB-ZBB32-NEXT: add a5, a5, a6 -; CHECK-ZVKB-ZBB32-NEXT: xor a6, t0, a3 +; CHECK-ZVKB-ZBB32-NEXT: mv a7, a6 +; CHECK-ZVKB-ZBB32-NEXT: slli t0, a6, 2 +; CHECK-ZVKB-ZBB32-NEXT: add a6, a6, a4 +; CHECK-ZVKB-ZBB32-NEXT: add t0, a0, t0 +; CHECK-ZVKB-ZBB32-NEXT: vl2re32.v v8, (t0) +; CHECK-ZVKB-ZBB32-NEXT: sltu a7, a6, a7 +; CHECK-ZVKB-ZBB32-NEXT: add a5, a5, a7 +; CHECK-ZVKB-ZBB32-NEXT: xor a7, a6, a3 ; CHECK-ZVKB-ZBB32-NEXT: vandn.vx v8, v8, a1 -; CHECK-ZVKB-ZBB32-NEXT: or t1, a6, a5 -; CHECK-ZVKB-ZBB32-NEXT: vs2r.v v8, (a7) -; CHECK-ZVKB-ZBB32-NEXT: mv a6, t0 -; CHECK-ZVKB-ZBB32-NEXT: bnez t1, .LBB98_3 +; CHECK-ZVKB-ZBB32-NEXT: or a7, a7, a5 +; CHECK-ZVKB-ZBB32-NEXT: vs2r.v v8, (t0) +; CHECK-ZVKB-ZBB32-NEXT: bnez a7, .LBB98_3 ; CHECK-ZVKB-ZBB32-NEXT: # %bb.4: # %middle.block ; CHECK-ZVKB-ZBB32-NEXT: bnez a3, .LBB98_6 ; CHECK-ZVKB-ZBB32-NEXT: .LBB98_5: # %for.body diff --git a/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll index ed6b7f1e6efb8..10440089cff10 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll @@ -25,24 +25,24 @@ define dso_local void @test_store1(ptr nocapture noundef writeonly %dst, ptr noc ; RV32-NEXT: li a6, 0 ; RV32-NEXT: .LBB0_4: # %vector.body ; RV32-NEXT: # =>This Inner Loop Header: Depth=1 -; RV32-NEXT: slli t0, a7, 2 -; RV32-NEXT: addi t1, a7, 8 -; RV32-NEXT: add t0, a1, t0 +; RV32-NEXT: mv t0, a7 +; RV32-NEXT: slli t1, a7, 2 +; RV32-NEXT: addi a7, a7, 8 +; RV32-NEXT: add t1, a1, t1 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vle32.v v8, (t0) -; RV32-NEXT: sltu a7, t1, a7 -; RV32-NEXT: xor t0, t1, a5 -; RV32-NEXT: add a6, a6, a7 +; RV32-NEXT: vle32.v v8, (t1) +; RV32-NEXT: sltu t0, a7, t0 +; RV32-NEXT: xor t1, a7, a5 +; RV32-NEXT: add a6, a6, t0 ; RV32-NEXT: vmslt.vx v12, v8, a2 ; RV32-NEXT: vcompress.vm v10, v8, v12 -; RV32-NEXT: vcpop.m a7, v12 -; RV32-NEXT: vsetvli zero, a7, e32, m2, ta, ma +; RV32-NEXT: vcpop.m t0, v12 +; RV32-NEXT: vsetvli zero, t0, e32, m2, ta, ma ; RV32-NEXT: vse32.v v10, (a0) -; RV32-NEXT: slli a7, a7, 2 -; RV32-NEXT: or t0, t0, a6 -; RV32-NEXT: add a0, a0, a7 -; RV32-NEXT: mv a7, t1 -; RV32-NEXT: bnez t0, .LBB0_4 +; RV32-NEXT: slli t0, t0, 2 +; RV32-NEXT: or t1, t1, a6 +; RV32-NEXT: add a0, a0, t0 +; RV32-NEXT: bnez t1, .LBB0_4 ; RV32-NEXT: # %bb.5: # %middle.block ; RV32-NEXT: bne a5, a3, .LBB0_9 ; RV32-NEXT: .LBB0_6: # %for.cond.cleanup diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll index ead79fcf53d8b..af3b0852a6461 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll @@ -102,20 +102,20 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV32-NEXT: .LBB0_13: # %vector.body ; RV32-NEXT: # Parent Loop BB0_10 Depth=1 ; RV32-NEXT: # => This Inner Loop Header: Depth=2 -; RV32-NEXT: add s0, a2, t6 -; RV32-NEXT: add s1, a4, t6 -; RV32-NEXT: vl2r.v v8, (s0) -; RV32-NEXT: add s0, a0, t6 +; RV32-NEXT: mv s0, t6 +; RV32-NEXT: add t6, a2, t6 +; RV32-NEXT: add s1, a4, s0 +; RV32-NEXT: vl2r.v v8, (t6) +; RV32-NEXT: add s2, a0, s0 ; RV32-NEXT: vl2r.v v10, (s1) -; RV32-NEXT: add s1, t6, t2 -; RV32-NEXT: sltu t6, s1, t6 -; RV32-NEXT: add t5, t5, t6 -; RV32-NEXT: xor t6, s1, t4 +; RV32-NEXT: add t6, s0, t2 +; RV32-NEXT: sltu s0, t6, s0 +; 
RV32-NEXT: add t5, t5, s0 +; RV32-NEXT: xor s0, t6, t4 ; RV32-NEXT: vaaddu.vv v8, v8, v10 -; RV32-NEXT: or s2, t6, t5 -; RV32-NEXT: vs2r.v v8, (s0) -; RV32-NEXT: mv t6, s1 -; RV32-NEXT: bnez s2, .LBB0_13 +; RV32-NEXT: or s0, s0, t5 +; RV32-NEXT: vs2r.v v8, (s2) +; RV32-NEXT: bnez s0, .LBB0_13 ; RV32-NEXT: # %bb.14: # %middle.block ; RV32-NEXT: # in Loop: Header=BB0_10 Depth=1 ; RV32-NEXT: beq t4, a6, .LBB0_9 diff --git a/llvm/test/CodeGen/RISCV/sra-xor-sra.ll b/llvm/test/CodeGen/RISCV/sra-xor-sra.ll new file mode 100644 index 0000000000000..b04f0a29d07f3 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/sra-xor-sra.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s | FileCheck %s + +; Test folding of: (sra (xor (sra x, c1), -1), c2) -> (sra (xor x, -1), c3) +; Original motivating example: should merge sra+sra across xor +define i16 @not_invert_signbit_splat_mask(i8 %x, i16 %y) { +; CHECK-LABEL: not_invert_signbit_splat_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 56 +; CHECK-NEXT: srai a0, a0, 62 +; CHECK-NEXT: not a0, a0 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: ret + %a = ashr i8 %x, 6 + %n = xor i8 %a, -1 + %s = sext i8 %n to i16 + %r = and i16 %s, %y + ret i16 %r +} + +; Edge case +define i16 @sra_xor_sra_overflow(i8 %x, i16 %y) { +; CHECK-LABEL: sra_xor_sra_overflow: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: ret + %a = ashr i8 %x, 10 + %n = xor i8 %a, -1 + %s = sext i8 %n to i16 + %r = and i16 %s, %y + ret i16 %r +} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll index 79665af17ef58..9632469261f4d 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/constbound.ll @@ -7,22 +7,22 @@ define dso_local i32 @test_500_504(ptr nocapture readonly %x) { ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: mov.w lr, #126 -; CHECK-NEXT: adr r2, .LCPI0_0 -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: mov.w r2, #500 -; CHECK-NEXT: vdup.32 q1, r2 -; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: adr r1, .LCPI0_0 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: mov.w r1, #500 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: vdup.32 q1, r1 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vqadd.u32 q2, q0, r1 -; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: vqadd.u32 q2, q0, r2 +; CHECK-NEXT: adds r2, #4 ; CHECK-NEXT: vptt.u32 hi, q1, q2 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 -; CHECK-NEXT: vaddvat.u32 r2, q2 +; CHECK-NEXT: vaddvat.u32 r12, q2 ; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll index ec257bcf123f3..bcedcd40ba112 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll @@ -28,29 +28,29 @@ define void @arm_min_q31(ptr nocapture readonly %pSrc, i32 %blockSize, ptr nocap ; CHECK-NEXT: str r6, [sp] @ 4-byte Spill ; CHECK-NEXT: subs r7, #4 ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: mov.w r10, #0 +; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2 ; CHECK-NEXT: .LBB0_5: @ 
%while.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: ldr r11, [r0, #16]!
-; CHECK-NEXT: ldrd r5, r7, [r0, #-12]
+; CHECK-NEXT: ldrd r5, r6, [r0, #-12]
 ; CHECK-NEXT: ldr r4, [r0, #-4]
 ; CHECK-NEXT: cmp r12, r5
 ; CHECK-NEXT: csel r5, r5, r12, gt
-; CHECK-NEXT: csinc r6, r10, r8, le
-; CHECK-NEXT: cmp r5, r7
+; CHECK-NEXT: csinc r7, r10, r8, le
+; CHECK-NEXT: cmp r5, r6
 ; CHECK-NEXT: it gt
-; CHECK-NEXT: addgt.w r6, r8, #2
-; CHECK-NEXT: csel r7, r7, r5, gt
-; CHECK-NEXT: cmp r7, r4
+; CHECK-NEXT: addgt.w r7, r8, #2
+; CHECK-NEXT: csel r6, r6, r5, gt
+; CHECK-NEXT: cmp r6, r4
 ; CHECK-NEXT: it gt
-; CHECK-NEXT: addgt.w r6, r8, #3
-; CHECK-NEXT: csel r7, r4, r7, gt
+; CHECK-NEXT: addgt.w r7, r8, #3
+; CHECK-NEXT: csel r6, r4, r6, gt
 ; CHECK-NEXT: add.w r8, r8, #4
-; CHECK-NEXT: cmp r7, r11
-; CHECK-NEXT: csel r10, r8, r6, gt
-; CHECK-NEXT: csel r12, r11, r7, gt
+; CHECK-NEXT: cmp r6, r11
+; CHECK-NEXT: csel r10, r8, r7, gt
+; CHECK-NEXT: csel r12, r11, r6, gt
 ; CHECK-NEXT: le lr, .LBB0_5
 ; CHECK-NEXT: @ %bb.6: @ %while.end.loopexit.unr-lcssa.loopexit
 ; CHECK-NEXT: ldr r6, [sp] @ 4-byte Reload
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
index 1769c5d2fd385..98e082be4cad1 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
@@ -21,11 +21,12 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; ENABLED-NEXT: it lt
 ; ENABLED-NEXT: bxlt lr
 ; ENABLED-NEXT: .LBB0_1: @ %for.body.lr.ph
-; ENABLED-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; ENABLED-NEXT: mov r11, r0
-; ENABLED-NEXT: ldr r0, [sp, #32]
+; ENABLED-NEXT: ldr r0, [sp, #36]
 ; ENABLED-NEXT: add.w r9, r2, #3
 ; ENABLED-NEXT: mov.w r12, #0
+; ENABLED-NEXT: mov.w r8, #1
 ; ENABLED-NEXT: mov r10, r11
 ; ENABLED-NEXT: uxth r0, r0
 ; ENABLED-NEXT: rsbs r5, r0, #0
@@ -49,18 +50,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; ENABLED-NEXT: @ %bb.5: @ %vector.ph
 ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1
 ; ENABLED-NEXT: bic r0, r9, #3
-; ENABLED-NEXT: movs r7, #1
-; ENABLED-NEXT: subs r0, #4
 ; ENABLED-NEXT: sub.w r4, r2, r12
+; ENABLED-NEXT: subs r0, #4
 ; ENABLED-NEXT: vmov.i32 q1, #0x0
-; ENABLED-NEXT: add.w r6, r7, r0, lsr #2
+; ENABLED-NEXT: mov r7, r10
+; ENABLED-NEXT: add.w r6, r8, r0, lsr #2
 ; ENABLED-NEXT: adds r0, r2, #3
 ; ENABLED-NEXT: sub.w r0, r0, r12
 ; ENABLED-NEXT: bic r0, r0, #3
 ; ENABLED-NEXT: subs r0, #4
-; ENABLED-NEXT: add.w r0, r7, r0, lsr #2
-; ENABLED-NEXT: mov r7, r10
-; ENABLED-NEXT: dls lr, r0
+; ENABLED-NEXT: add.w lr, r8, r0, lsr #2
 ; ENABLED-NEXT: mov r0, r11
 ; ENABLED-NEXT: .LBB0_6: @ %vector.body
 ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1
@@ -83,7 +82,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; ENABLED-NEXT: vaddv.u32 r0, q0
 ; ENABLED-NEXT: b .LBB0_3
 ; ENABLED-NEXT: .LBB0_8:
-; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; ENABLED-NEXT: bx lr
 ;
 ; NOREDUCTIONS-LABEL: varying_outer_2d_reduction:
@@ -92,11 +91,12 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; NOREDUCTIONS-NEXT: it lt
 ; NOREDUCTIONS-NEXT: bxlt lr
 ; NOREDUCTIONS-NEXT: .LBB0_1: @ %for.body.lr.ph
-; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; NOREDUCTIONS-NEXT: mov r11, r0
-; NOREDUCTIONS-NEXT: ldr r0, [sp, #32]
+; NOREDUCTIONS-NEXT: ldr r0, [sp, #36]
 ; NOREDUCTIONS-NEXT: add.w r9, r2, #3
 ; NOREDUCTIONS-NEXT: mov.w r12, #0
+; NOREDUCTIONS-NEXT: mov.w r8, #1
 ; NOREDUCTIONS-NEXT: mov r10, r11
 ; NOREDUCTIONS-NEXT: uxth r0, r0
 ; NOREDUCTIONS-NEXT: rsbs r5, r0, #0
@@ -120,18 +120,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph
 ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1
 ; NOREDUCTIONS-NEXT: bic r0, r9, #3
-; NOREDUCTIONS-NEXT: movs r7, #1
-; NOREDUCTIONS-NEXT: subs r0, #4
 ; NOREDUCTIONS-NEXT: sub.w r4, r2, r12
+; NOREDUCTIONS-NEXT: subs r0, #4
 ; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0
-; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2
+; NOREDUCTIONS-NEXT: mov r7, r10
+; NOREDUCTIONS-NEXT: add.w r6, r8, r0, lsr #2
 ; NOREDUCTIONS-NEXT: adds r0, r2, #3
 ; NOREDUCTIONS-NEXT: sub.w r0, r0, r12
 ; NOREDUCTIONS-NEXT: bic r0, r0, #3
 ; NOREDUCTIONS-NEXT: subs r0, #4
-; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2
-; NOREDUCTIONS-NEXT: mov r7, r10
-; NOREDUCTIONS-NEXT: dls lr, r0
+; NOREDUCTIONS-NEXT: add.w lr, r8, r0, lsr #2
 ; NOREDUCTIONS-NEXT: mov r0, r11
 ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body
 ; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1
@@ -154,7 +152,7 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0
 ; NOREDUCTIONS-NEXT: b .LBB0_3
 ; NOREDUCTIONS-NEXT: .LBB0_8:
-; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; NOREDUCTIONS-NEXT: bx lr
 entry:
 %conv = sext i16 %N to i32
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
index cbcbf1f392ce8..435acc29f076e 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while-loops.ll
@@ -165,74 +165,73 @@ define dso_local i32 @b(ptr %c, i32 %d, i32 %e, ptr %n) "frame-pointer"="all" {
 ; CHECK-NEXT: sub sp, #16
 ; CHECK-NEXT: wls lr, r1, .LBB2_3
 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
-; CHECK-NEXT: adds r6, r3, #4
-; CHECK-NEXT: adds r1, r0, #4
+; CHECK-NEXT: add.w r9, r3, #4
+; CHECK-NEXT: add.w r10, r0, #4
 ; CHECK-NEXT: mvn r8, #1
-; CHECK-NEXT: @ implicit-def: $r9
+; CHECK-NEXT: @ implicit-def: $r6
 ; CHECK-NEXT: @ implicit-def: $r4
 ; CHECK-NEXT: str r2, [sp] @ 4-byte Spill
 ; CHECK-NEXT: .LBB2_2: @ %while.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: ldr.w r1, [r10]
 ; CHECK-NEXT: asrs r2, r4, #31
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: ldr r1, [r1]
+; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT: muls r1, r3, r1
 ; CHECK-NEXT: adds r4, r4, r1
 ; CHECK-NEXT: adc.w r1, r2, r1, asr #31
 ; CHECK-NEXT: adds.w r2, r4, #-2147483648
-; CHECK-NEXT: ldrd r2, r4, [r8]
-; CHECK-NEXT: adc r5, r1, #0
-; CHECK-NEXT: str r2, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: smull r4, r2, r4, r9
-; CHECK-NEXT: asrs r1, r5, #31
+; CHECK-NEXT: ldrd r5, r4, [r8]
+; CHECK-NEXT: adc r2, r1, #0
 ; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: subs r4, r5, r4
-; CHECK-NEXT: sbcs r1, r2
-; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: adds.w r10, r4, #-2147483648
-; CHECK-NEXT: adc r1, r1, #0
-; CHECK-NEXT: ldr r4, [r2, #-4]
+; CHECK-NEXT: smull r4, r5, r4, r6
+; CHECK-NEXT: asrs r1, r2, #31
+; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: subs r4, r2, r4
+; CHECK-NEXT: sbcs r1, r5
+; CHECK-NEXT: adds.w r6, r4, #-2147483648
+; CHECK-NEXT: ldr r4, [r10, #-4]
+; CHECK-NEXT: adc r11, r1, #0
+; CHECK-NEXT: mov r1, r9
+; CHECK-NEXT: add.w r10, r10, #4
 ; CHECK-NEXT: muls r4, r3, r4
 ; CHECK-NEXT: adds r3, #4
 ; CHECK-NEXT: adds.w r12, r4, #-2147483648
 ; CHECK-NEXT: asr.w r5, r4, #31
-; CHECK-NEXT: ldr r4, [r6]
+; CHECK-NEXT: ldr.w r4, [r9]
 ; CHECK-NEXT: adc r5, r5, #0
 ; CHECK-NEXT: mul r2, r4, r0
-; CHECK-NEXT: adds r0, #4
 ; CHECK-NEXT: add.w r2, r2, #-2147483648
 ; CHECK-NEXT: asrl r12, r5, r2
-; CHECK-NEXT: smull r2, r5, r4, r12
-; CHECK-NEXT: lsll r2, r5, #30
-; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: asr.w r11, r5, #31
-; CHECK-NEXT: mov r12, r5
-; CHECK-NEXT: lsll r12, r11, r4
-; CHECK-NEXT: mul r2, r2, r9
-; CHECK-NEXT: lsrl r12, r11, #2
-; CHECK-NEXT: adds r2, #2
-; CHECK-NEXT: lsll r12, r11, r2
+; CHECK-NEXT: smull r2, r9, r4, r12
+; CHECK-NEXT: mov r12, r0
+; CHECK-NEXT: lsll r2, r9, #30
+; CHECK-NEXT: asr.w r5, r9, #31
+; CHECK-NEXT: mov r2, r9
+; CHECK-NEXT: mov r9, r1
+; CHECK-NEXT: ldrd r1, r0, [sp, #4] @ 8-byte Folded Reload
+; CHECK-NEXT: lsll r2, r5, r4
+; CHECK-NEXT: lsrl r2, r5, #2
+; CHECK-NEXT: muls r0, r1, r0
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: lsll r2, r5, r0
+; CHECK-NEXT: add.w r0, r2, #-2147483648
 ; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
-; CHECK-NEXT: add.w r5, r12, #-2147483648
-; CHECK-NEXT: asrl r10, r1, r5
-; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: lsrl r10, r1, #2
-; CHECK-NEXT: movs r1, #2
-; CHECK-NEXT: mov r9, r10
-; CHECK-NEXT: str.w r10, [r1]
-; CHECK-NEXT: ldr r1, [r8], #-4
-; CHECK-NEXT: mls r5, r1, r4, r5
-; CHECK-NEXT: adds.w r4, r5, #-2147483648
-; CHECK-NEXT: asr.w r1, r5, #31
+; CHECK-NEXT: asrl r6, r11, r0
+; CHECK-NEXT: movs r0, #2
+; CHECK-NEXT: lsrl r6, r11, #2
+; CHECK-NEXT: str r6, [r0]
+; CHECK-NEXT: ldr r0, [r8], #-4
+; CHECK-NEXT: mls r0, r0, r4, r1
+; CHECK-NEXT: adds.w r4, r0, #-2147483648
+; CHECK-NEXT: asr.w r1, r0, #31
 ; CHECK-NEXT: adc r1, r1, #0
 ; CHECK-NEXT: lsrl r4, r1, #2
-; CHECK-NEXT: rsbs r1, r4, #0
-; CHECK-NEXT: str r1, [r2]
-; CHECK-NEXT: str r1, [r6, #-4]
-; CHECK-NEXT: adds r6, #4
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: adds r1, #4
+; CHECK-NEXT: rsbs r0, r4, #0
+; CHECK-NEXT: str r0, [r2]
+; CHECK-NEXT: str r0, [r9, #-4]
+; CHECK-NEXT: add.w r9, r9, #4
+; CHECK-NEXT: add.w r0, r12, #4
 ; CHECK-NEXT: le lr, .LBB2_2
 ; CHECK-NEXT: .LBB2_3: @ %while.end
 ; CHECK-NEXT: add sp, #16
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index f7b4548f127bf..b6657d607ce6d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -1573,120 +1573,115 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %
 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT: .pad #16
 ; CHECK-NEXT: sub sp, #16
-; CHECK-NEXT: ldrd r7, r9, [r0]
-; CHECK-NEXT: and r6, r3, #3
-; CHECK-NEXT: ldr r0, [r0, #8]
-; CHECK-NEXT: lsrs r3, r3, #2
-; CHECK-NEXT: @ implicit-def: $r12
-; CHECK-NEXT: str r6, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: str r3, [sp] @ 4-byte Spill
-; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: ldm.w r0, {r7, r9, r11}
+; CHECK-NEXT: and r0, r3, #3
+; CHECK-NEXT: @ implicit-def: $r5
+; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: lsrs r0, r3, #2
+; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT: b .LBB19_3
 ; CHECK-NEXT: .LBB19_1: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT: mov r3, r8
-; CHECK-NEXT: mov r2, r5
-; CHECK-NEXT: mov r4, r11
-; CHECK-NEXT: mov r8, r10
+; CHECK-NEXT: mov r8, r3
+; CHECK-NEXT: mov r3, r12
+; CHECK-NEXT: mov r0, r4
+; CHECK-NEXT: mov r12, r10
 ; CHECK-NEXT: .LBB19_2: @ %if.end69
 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
 ; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: adds r0, #128
-; CHECK-NEXT: strd r2, r4, [r9]
-; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: subs r7, #1
-; CHECK-NEXT: strd r3, r8, [r9, #8]
-; CHECK-NEXT: add.w r9, r9, #16
+; CHECK-NEXT: add.w r11, r11, #128
+; CHECK-NEXT: strd r8, r0, [r9]
 ; CHECK-NEXT: mov r1, r2
+; CHECK-NEXT: strd r3, r12, [r9, #8]
+; CHECK-NEXT: add.w r9, r9, #16
+; CHECK-NEXT: subs r7, #1
 ; CHECK-NEXT: beq.w .LBB19_13
 ; CHECK-NEXT: .LBB19_3: @ %do.body
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB19_5 Depth 2
-; CHECK-NEXT: ldrd r5, r11, [r9]
+; CHECK-NEXT: ldr.w r10, [r9, #12]
 ; CHECK-NEXT: mov r6, r2
-; CHECK-NEXT: ldrd r8, r10, [r9, #8]
-; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
+; CHECK-NEXT: ldm.w r9, {r3, r4, r12}
+; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: wls lr, r2, .LBB19_6
+; CHECK-NEXT: wls lr, r0, .LBB19_6
 ; CHECK-NEXT: @ %bb.4: @ %while.body.lr.ph
 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: mov r4, r11
-; CHECK-NEXT: mov r3, r5
+; CHECK-NEXT: mov r6, r2
 ; CHECK-NEXT: .LBB19_5: @ %while.body
 ; CHECK-NEXT: @ Parent Loop BB19_3 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldr r5, [r1, #12]
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vldrw.u32 q6, [r0, #16]
-; CHECK-NEXT: ldm.w r1, {r2, r7, r11}
-; CHECK-NEXT: vmul.f32 q2, q2, r5
-; CHECK-NEXT: vldrw.u32 q7, [r0, #32]
-; CHECK-NEXT: vfma.f32 q2, q6, r11
-; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT: mov r5, r3
+; CHECK-NEXT: mov r8, r4
+; CHECK-NEXT: ldrd r4, r3, [r1, #8]
+; CHECK-NEXT: vldrw.u32 q2, [r11]
+; CHECK-NEXT: vldrw.u32 q6, [r11, #16]
+; CHECK-NEXT: ldrd r0, r7, [r1]
+; CHECK-NEXT: vmul.f32 q2, q2, r3
+; CHECK-NEXT: vldrw.u32 q7, [r11, #32]
+; CHECK-NEXT: vfma.f32 q2, q6, r4
+; CHECK-NEXT: vldrw.u32 q4, [r11, #48]
 ; CHECK-NEXT: vfma.f32 q2, q7, r7
-; CHECK-NEXT: vldrw.u32 q5, [r0, #64]
-; CHECK-NEXT: vfma.f32 q2, q4, r2
-; CHECK-NEXT: vldrw.u32 q3, [r0, #80]
-; CHECK-NEXT: vfma.f32 q2, q5, r3
-; CHECK-NEXT: vldrw.u32 q1, [r0, #96]
-; CHECK-NEXT: vfma.f32 q2, q3, r4
-; CHECK-NEXT: vldrw.u32 q0, [r0, #112]
-; CHECK-NEXT: vfma.f32 q2, q1, r8
+; CHECK-NEXT: vldrw.u32 q5, [r11, #64]
+; CHECK-NEXT: vfma.f32 q2, q4, r0
+; CHECK-NEXT: vldrw.u32 q3, [r11, #80]
+; CHECK-NEXT: vfma.f32 q2, q5, r5
+; CHECK-NEXT: vldrw.u32 q1, [r11, #96]
+; CHECK-NEXT: vfma.f32 q2, q3, r8
+; CHECK-NEXT: vldrw.u32 q0, [r11, #112]
+; CHECK-NEXT: vfma.f32 q2, q1, r12
 ; CHECK-NEXT: adds r1, #16
 ; CHECK-NEXT: vfma.f32 q2, q0, r10
-; CHECK-NEXT: mov r4, r11
-; CHECK-NEXT: vmov r10, r8, d5
+; CHECK-NEXT: mov r5, r3
+; CHECK-NEXT: vmov r10, r12, d5
 ; CHECK-NEXT: vstrb.8 q2, [r6], #16
-; CHECK-NEXT: mov r3, r5
-; CHECK-NEXT: mov r12, r5
 ; CHECK-NEXT: le lr, .LBB19_5
 ; CHECK-NEXT: .LBB19_6: @ %while.end
 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: cmp r3, #0
+; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: cmp r7, #0
 ; CHECK-NEXT: beq .LBB19_1
 ; CHECK-NEXT: @ %bb.7: @ %if.then
 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT: ldrd lr, r4, [r1]
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: ldrd r2, r1, [r1, #8]
-; CHECK-NEXT: vldrw.u32 q6, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q7, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT: ldrd lr, r0, [r1]
+; CHECK-NEXT: vldrw.u32 q0, [r11]
+; CHECK-NEXT: ldrd r8, r1, [r1, #8]
+; CHECK-NEXT: vldrw.u32 q6, [r11, #16]
+; CHECK-NEXT: vldrw.u32 q7, [r11, #32]
+; CHECK-NEXT: vldrw.u32 q4, [r11, #48]
 ; CHECK-NEXT: vmul.f32 q0, q0, r1
-; CHECK-NEXT: vldrw.u32 q5, [r0, #64]
-; CHECK-NEXT: vfma.f32 q0, q6, r2
-; CHECK-NEXT: vldrw.u32 q3, [r0, #80]
-; CHECK-NEXT: vfma.f32 q0, q7, r4
-; CHECK-NEXT: vldrw.u32 q2, [r0, #96]
+; CHECK-NEXT: vldrw.u32 q5, [r11, #64]
+; CHECK-NEXT: vfma.f32 q0, q6, r8
+; CHECK-NEXT: vldrw.u32 q3, [r11, #80]
+; CHECK-NEXT: vfma.f32 q0, q7, r0
+; CHECK-NEXT: vldrw.u32 q2, [r11, #96]
 ; CHECK-NEXT: vfma.f32 q0, q4, lr
-; CHECK-NEXT: vldrw.u32 q1, [r0, #112]
-; CHECK-NEXT: vfma.f32 q0, q5, r5
-; CHECK-NEXT: cmp r3, #1
-; CHECK-NEXT: vfma.f32 q0, q3, r11
-; CHECK-NEXT: vfma.f32 q0, q2, r8
+; CHECK-NEXT: vldrw.u32 q1, [r11, #112]
+; CHECK-NEXT: vfma.f32 q0, q5, r3
+; CHECK-NEXT: cmp r7, #1
+; CHECK-NEXT: vfma.f32 q0, q3, r4
+; CHECK-NEXT: vfma.f32 q0, q2, r12
 ; CHECK-NEXT: vfma.f32 q0, q1, r10
-; CHECK-NEXT: vmov r5, s0
+; CHECK-NEXT: vmov r4, s0
 ; CHECK-NEXT: bne .LBB19_9
 ; CHECK-NEXT: @ %bb.8: @ %if.then58
 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT: str r5, [r6]
-; CHECK-NEXT: mov r2, lr
-; CHECK-NEXT: mov r4, r12
-; CHECK-NEXT: mov r3, r5
+; CHECK-NEXT: str r4, [r6]
+; CHECK-NEXT: mov r8, lr
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r3, r4
 ; CHECK-NEXT: b .LBB19_12
 ; CHECK-NEXT: .LBB19_9: @ %if.else
 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT: vmov r8, s1
-; CHECK-NEXT: cmp r3, #2
+; CHECK-NEXT: vmov r12, s1
+; CHECK-NEXT: cmp r7, #2
 ; CHECK-NEXT: vstr s1, [r6, #4]
-; CHECK-NEXT: str r5, [r6]
+; CHECK-NEXT: str r4, [r6]
 ; CHECK-NEXT: bne .LBB19_11
 ; CHECK-NEXT: @ %bb.10: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT: mov r2, r4
-; CHECK-NEXT: mov r3, r8
-; CHECK-NEXT: mov r4, lr
-; CHECK-NEXT: mov r8, r5
+; CHECK-NEXT: mov r8, r0
+; CHECK-NEXT: mov r3, r12
+; CHECK-NEXT: mov r0, lr
+; CHECK-NEXT: mov r12, r4
 ; CHECK-NEXT: b .LBB19_12
 ; CHECK-NEXT: .LBB19_11: @ %if.else64
 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
@@ -1694,7 +1689,7 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %
 ; CHECK-NEXT: vstr s2, [r6, #8]
 ; CHECK-NEXT: .LBB19_12: @ %if.end69
 ; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
-; CHECK-NEXT: mov r12, r1
+; CHECK-NEXT: mov r5, r1
 ; CHECK-NEXT: b .LBB19_2
 ; CHECK-NEXT: .LBB19_13: @ %do.end
 ; CHECK-NEXT: add sp, #16
@@ -1901,8 +1896,8 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
 ; CHECK-NEXT: ldrd r6, r12, [r0, #4]
 ; CHECK-NEXT: lsr.w r8, r3, #1
 ; CHECK-NEXT: ldrb r0, [r0]
@@ -1910,11 +1905,11 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur
 ; CHECK-NEXT: b .LBB20_3
 ; CHECK-NEXT: .LBB20_1: @ %if.else
 ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1
-; CHECK-NEXT: vmov.f32 s14, s13
-; CHECK-NEXT: vstr s12, [r6]
+; CHECK-NEXT: vmov.f32 s6, s5
+; CHECK-NEXT: vstr s4, [r6]
 ; CHECK-NEXT: .LBB20_2: @ %if.end
 ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1
-; CHECK-NEXT: vstr s14, [r6, #4]
+; CHECK-NEXT: vstr s6, [r6, #4]
 ; CHECK-NEXT: add.w r12, r12, #20
 ; CHECK-NEXT: adds r6, #8
 ; CHECK-NEXT: subs r0, #1
@@ -1923,41 +1918,39 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur
 ; CHECK-NEXT: .LBB20_3: @ %do.body
 ; CHECK-NEXT: @ =>This Loop Header: Depth=1
 ; CHECK-NEXT: @ Child Loop BB20_5 Depth 2
-; CHECK-NEXT: vldrw.u32 q2, [r12]
+; CHECK-NEXT: vldrw.u32 q3, [r12]
 ; CHECK-NEXT: movs r5, #0
-; CHECK-NEXT: vmov q4, q2
+; CHECK-NEXT: vmov q4, q3
 ; CHECK-NEXT: vshlc q4, r5, #32
-; CHECK-NEXT: vldrw.u32 q1, [r12, #8]
-; CHECK-NEXT: vmov q5, q1
+; CHECK-NEXT: vldrw.u32 q2, [r12, #8]
+; CHECK-NEXT: vmov q5, q2
 ; CHECK-NEXT: vshlc q5, r5, #32
-; CHECK-NEXT: vldrw.u32 q3, [r6]
-; CHECK-NEXT: vmov.f32 s14, s0
+; CHECK-NEXT: vldrw.u32 q1, [r6]
+; CHECK-NEXT: vmov.f32 s6, s0
 ; CHECK-NEXT: mov r5, r2
-; CHECK-NEXT: vmov.f32 s15, s0
+; CHECK-NEXT: vmov.f32 s7, s0
 ; CHECK-NEXT: wls lr, r8, .LBB20_6
 ; CHECK-NEXT: @ %bb.4: @ %while.body.preheader
 ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1
-; CHECK-NEXT: vmov q6, q3
 ; CHECK-NEXT: mov r5, r2
 ; CHECK-NEXT: .LBB20_5: @ %while.body
 ; CHECK-NEXT: @ Parent Loop BB20_3 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT: ldrd r7, r4, [r1], #8
-; CHECK-NEXT: vfma.f32 q6, q2, r7
-; CHECK-NEXT: vmov r7, s24
-; CHECK-NEXT: vmov q3, q6
-; CHECK-NEXT: vfma.f32 q3, q1, r7
-; CHECK-NEXT: vstr s24, [r5]
-; CHECK-NEXT: vmov.f32 s15, s0
-; CHECK-NEXT: vfma.f32 q3, q4, r4
-; CHECK-NEXT: vmov r4, s13
-; CHECK-NEXT: vstr s13, [r5, #4]
-; CHECK-NEXT: vfma.f32 q3, q5, r4
+; CHECK-NEXT: vfma.f32 q1, q3, r7
+; CHECK-NEXT: vmov r7, s4
+; CHECK-NEXT: vmov.f32 s2, s4
+; CHECK-NEXT: vfma.f32 q1, q2, r7
+; CHECK-NEXT: vmov.f32 s7, s0
+; CHECK-NEXT: vfma.f32 q1, q4, r4
+; CHECK-NEXT: vmov r4, s5
+; CHECK-NEXT: vstr s5, [r5, #4]
+; CHECK-NEXT: vfma.f32 q1, q5, r4
+; CHECK-NEXT: vmov.f32 s4, s6
+; CHECK-NEXT: vmov.f32 s5, s7
+; CHECK-NEXT: vmov.f32 s6, s0
+; CHECK-NEXT: vstr s2, [r5]
 ; CHECK-NEXT: adds r5, #8
-; CHECK-NEXT: vmov.f32 s12, s14
-; CHECK-NEXT: vmov.f32 s13, s15
-; CHECK-NEXT: vmov.f32 s14, s0
-; CHECK-NEXT: vmov q6, q3
 ; CHECK-NEXT: le lr, .LBB20_5
 ; CHECK-NEXT: .LBB20_6: @ %while.end
 ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1
@@ -1966,14 +1959,14 @@ define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocaptur
 ; CHECK-NEXT: @ %bb.7: @ %if.then
 ; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1
 ; CHECK-NEXT: ldr r1, [r1]
-; CHECK-NEXT: vfma.f32 q3, q2, r1
-; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vstr s12, [r5]
-; CHECK-NEXT: vfma.f32 q3, q1, r1
-; CHECK-NEXT: vstr s13, [r6]
+; CHECK-NEXT: vfma.f32 q1, q3, r1
+; CHECK-NEXT: vmov r1, s4
+; CHECK-NEXT: vstr s4, [r5]
+; CHECK-NEXT: vfma.f32 q1, q2, r1
+; CHECK-NEXT: vstr s5, [r6]
 ; CHECK-NEXT: b .LBB20_2
 ; CHECK-NEXT: .LBB20_8: @ %do.end
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
 ; CHECK-NEXT: .p2align 2
 ; CHECK-NEXT: @ %bb.9:
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index 0d86f22a321e0..b60ee7c6d406b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -1313,27 +1313,29 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
 ; CHECK-NEXT: @ Child Loop BB16_3 Depth 2
 ; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload
 ; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT: vmov q4, q3
 ; CHECK-NEXT: .LBB16_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB16_2 Depth=1
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vadd.i32 q1, q5, r0
+; CHECK-NEXT: vmov q0, q6
+; CHECK-NEXT: vadd.i32 q6, q5, r0
+; CHECK-NEXT: vmov r7, r3, d13
 ; CHECK-NEXT: vadd.i32 q2, q4, r0
-; CHECK-NEXT: vmov r7, r3, d3
-; CHECK-NEXT: vadd.i32 q6, q0, lr
 ; CHECK-NEXT: vmov r5, r6, d5
+; CHECK-NEXT: vmov q1, q7
+; CHECK-NEXT: vmov r4, r10, d12
+; CHECK-NEXT: vadd.i32 q6, q0, lr
 ; CHECK-NEXT: subs.w r9, r9, #16
-; CHECK-NEXT: vmov r4, r10, d2
-; CHECK-NEXT: vadd.i32 q1, q7, lr
 ; CHECK-NEXT: vadd.i32 q4, q4, lr
 ; CHECK-NEXT: vadd.i32 q5, q5, lr
+; CHECK-NEXT: vadd.i32 q7, q7, lr
 ; CHECK-NEXT: ldrb.w r11, [r3]
 ; CHECK-NEXT: ldrb r3, [r7]
 ; CHECK-NEXT: vmov r7, r12, d4
-; CHECK-NEXT: vadd.i32 q2, q7, r0
-; CHECK-NEXT: vadd.i32 q7, q0, r0
+; CHECK-NEXT: vadd.i32 q2, q1, r0
+; CHECK-NEXT: vadd.i32 q1, q0, r0
 ; CHECK-NEXT: ldrb r5, [r5]
 ; CHECK-NEXT: ldrb r6, [r6]
 ; CHECK-NEXT: ldrb r4, [r4]
@@ -1342,7 +1344,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
 ; CHECK-NEXT: ldrb.w r1, [r12]
 ; CHECK-NEXT: vmov.8 q0[0], r7
 ; CHECK-NEXT: vmov.8 q0[1], r1
-; CHECK-NEXT: vmov r1, r7, d15
+; CHECK-NEXT: vmov r1, r7, d3
 ; CHECK-NEXT: vmov.8 q0[2], r5
 ; CHECK-NEXT: vmov.8 q0[3], r6
 ; CHECK-NEXT: vmov.8 q0[4], r4
@@ -1357,8 +1359,7 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
 ; CHECK-NEXT: ldrb r3, [r5]
 ; CHECK-NEXT: ldrb.w r12, [r7]
 ; CHECK-NEXT: ldrb r5, [r4]
-; CHECK-NEXT: vmov r4, r7, d14
-; CHECK-NEXT: vmov q7, q1
+; CHECK-NEXT: vmov r4, r7, d2
 ; CHECK-NEXT: ldrb r4, [r4]
 ; CHECK-NEXT: ldrb r7, [r7]
 ; CHECK-NEXT: vmov.8 q0[8], r4
@@ -1370,7 +1371,6 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
 ; CHECK-NEXT: vmov.8 q0[14], r3
 ; CHECK-NEXT: vmov.8 q0[15], r12
 ; CHECK-NEXT: vstrb.8 q0, [r8], #16
-; CHECK-NEXT: vmov q0, q6
 ; CHECK-NEXT: bne .LBB16_3
 ; CHECK-NEXT: @ %bb.4: @ %middle.block
 ; CHECK-NEXT: @ in Loop: Header=BB16_2 Depth=1
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
index eedca2cd4a5d3..c0b2da7eff41b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -236,11 +236,11 @@ define arm_aapcs_vfpcc void @push_out_mul_gather_scatter(ptr noalias nocapture r
 ; CHECK-NEXT: vldrw.u32 q1, [r1]
 ; CHECK-NEXT: .LBB5_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q2, [r0, q1, uxtw #2]
-; CHECK-NEXT: vadd.i32 q3, q1, q0
+; CHECK-NEXT: vldrw.u32 q3, [r0, q1, uxtw #2]
 ; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2]
-; CHECK-NEXT: vmov q1, q3
+; CHECK-NEXT: vmov q2, q1
+; CHECK-NEXT: vadd.i32 q1, q1, q0
+; CHECK-NEXT: vstrw.32 q3, [r0, q2, uxtw #2]
 ; CHECK-NEXT: bne .LBB5_1
 ; CHECK-NEXT: @ %bb.2: @ %end
 ; CHECK-NEXT: bx lr
@@ -330,20 +330,20 @@ define arm_aapcs_vfpcc void @non_gatscat_use1(ptr noalias nocapture readonly %da
 ; CHECK-NEXT: vpush {d8, d9}
 ; CHECK-NEXT: adr r4, .LCPI7_0
 ; CHECK-NEXT: mov.w r12, #9
-; CHECK-NEXT: vldrw.u32 q1, [r4]
+; CHECK-NEXT: vldrw.u32 q0, [r4]
 ; CHECK-NEXT: mov.w lr, #12
 ; CHECK-NEXT: movs r4, #8
-; CHECK-NEXT: vdup.32 q0, r0
+; CHECK-NEXT: vdup.32 q1, r0
 ; CHECK-NEXT: .LBB7_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vadd.i32 q2, q1, r4
-; CHECK-NEXT: vmla.i32 q3, q1, lr
-; CHECK-NEXT: vmul.i32 q1, q1, r12
-; CHECK-NEXT: vldrw.u32 q4, [q3, #24]
+; CHECK-NEXT: vmov q2, q0
+; CHECK-NEXT: vmov q3, q1
+; CHECK-NEXT: vmla.i32 q3, q2, lr
 ; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vstrw.32 q1, [r3]
-; CHECK-NEXT: vmov q1, q2
+; CHECK-NEXT: vldrw.u32 q4, [q3, #24]
+; CHECK-NEXT: vmul.i32 q2, q2, r12
+; CHECK-NEXT: vadd.i32 q0, q0, r4
+; CHECK-NEXT: vstrw.32 q2, [r3]
 ; CHECK-NEXT: vstrb.8 q4, [r1], #16
 ; CHECK-NEXT: bne .LBB7_1
 ; CHECK-NEXT: @ %bb.2: @ %end
@@ -390,22 +390,22 @@ define arm_aapcs_vfpcc void @non_gatscat_use2(ptr noalias nocapture readonly %da
 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
 ; CHECK-NEXT: adr r4, .LCPI8_0
 ; CHECK-NEXT: movs r5, #18
-; CHECK-NEXT: vldrw.u32 q2, [r4]
+; CHECK-NEXT: vldrw.u32 q0, [r4]
 ; CHECK-NEXT: mov.w r12, #9
 ; CHECK-NEXT: mov.w lr, #12
 ; CHECK-NEXT: movs r4, #8
-; CHECK-NEXT: vdup.32 q0, r0
-; CHECK-NEXT: vdup.32 q1, r5
+; CHECK-NEXT: vdup.32 q1, r0
+; CHECK-NEXT: vdup.32 q2, r5
 ; CHECK-NEXT: .LBB8_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q4, q0
-; CHECK-NEXT: vadd.i32 q3, q2, r4
-; CHECK-NEXT: vmla.i32 q4, q2, lr
+; CHECK-NEXT: vmov q3, q0
+; CHECK-NEXT: vmov q4, q1
+; CHECK-NEXT: vmla.i32 q4, q3, lr
 ; CHECK-NEXT: subs r2, #4
 ; CHECK-NEXT: vldrw.u32 q5, [q4, #24]
-; CHECK-NEXT: vmov q4, q1
-; CHECK-NEXT: vmla.i32 q4, q2, r12
-; CHECK-NEXT: vmov q2, q3
+; CHECK-NEXT: vmov q4, q2
+; CHECK-NEXT: vmla.i32 q4, q3, r12
+; CHECK-NEXT: vadd.i32 q0, q0, r4
 ; CHECK-NEXT: vstrb.8 q5, [r1], #16
 ; CHECK-NEXT: vstrw.32 q4, [r3]
 ; CHECK-NEXT: bne .LBB8_1
@@ -487,21 +487,21 @@ define dso_local void @arm_mat_mult_q31(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT: @ => This Loop Header: Depth=2
 ; CHECK-NEXT: @ Child Loop BB9_3 Depth 3
 ; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vmov q7, q2
+; CHECK-NEXT: vmov q1, q2
 ; CHECK-NEXT: dls lr, r10
 ; CHECK-NEXT: vmov.i32 q5, #0x0
-; CHECK-NEXT: vmlas.i32 q7, q0, r7
-; CHECK-NEXT: vmov q6, q4
+; CHECK-NEXT: vmlas.i32 q1, q0, r7
+; CHECK-NEXT: vmov q7, q4
 ; CHECK-NEXT: .LBB9_3: @ %vector.body
 ; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1
 ; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vadd.i32 q0, q7, q3
-; CHECK-NEXT: vldrw.u32 q1, [r1, q7, uxtw #2]
-; CHECK-NEXT: vldrw.u32 q7, [q6, #32]!
-; CHECK-NEXT: vmul.i32 q1, q1, q7
-; CHECK-NEXT: vmov q7, q0
-; CHECK-NEXT: vadd.i32 q5, q1, q5
+; CHECK-NEXT: vmov q6, q1
+; CHECK-NEXT: vadd.i32 q1, q1, q3
+; CHECK-NEXT: vldrw.u32 q0, [r1, q6, uxtw #2]
+; CHECK-NEXT: vldrw.u32 q6, [q7, #32]!
+; CHECK-NEXT: vmul.i32 q0, q0, q6
+; CHECK-NEXT: vadd.i32 q5, q0, q5
 ; CHECK-NEXT: le lr, .LBB9_3
 ; CHECK-NEXT: @ %bb.4: @ %middle.block
 ; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2
@@ -702,12 +702,12 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
 ; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vadd.i32 q6, q5, q3
-; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1]
-; CHECK-NEXT: vldrh.s32 q5, [r3], #8
-; CHECK-NEXT: vmul.i32 q5, q7, q5
-; CHECK-NEXT: vadd.i32 q4, q5, q4
-; CHECK-NEXT: vmov q5, q6
+; CHECK-NEXT: vmov q6, q5
+; CHECK-NEXT: vadd.i32 q5, q5, q3
+; CHECK-NEXT: vldrh.s32 q7, [r1, q6, uxtw #1]
+; CHECK-NEXT: vldrh.s32 q6, [r3], #8
+; CHECK-NEXT: vmul.i32 q6, q7, q6
+; CHECK-NEXT: vadd.i32 q4, q6, q4
 ; CHECK-NEXT: le lr, .LBB10_11
 ; CHECK-NEXT: @ %bb.12: @ %middle.block
 ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
@@ -922,15 +922,15 @@ define hidden arm_aapcs_vfpcc i32 @arm_depthwise_conv_s8(ptr nocapture readonly
 ; CHECK-NEXT: @ Parent Loop BB11_3 Depth=3
 ; CHECK-NEXT: @ Parent Loop BB11_4 Depth=4
 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=5
-; CHECK-NEXT: vldrb.s32 q2, [r0, q5]
-; CHECK-NEXT: vadd.i32 q7, q5, q0
-; CHECK-NEXT: vldrb.s32 q5, [r1, q4]
-; CHECK-NEXT: vadd.i32 q6, q4, q0
-; CHECK-NEXT: vadd.i32 q2, q2, r2
+; CHECK-NEXT: vmov q7, q5
+; CHECK-NEXT: vmov q6, q4
+; CHECK-NEXT: vldrb.s32 q2, [r0, q7]
+; CHECK-NEXT: vldrb.s32 q7, [r1, q6]
 ; CHECK-NEXT: subs r5, #4
-; CHECK-NEXT: vmlava.u32 r12, q2, q5
-; CHECK-NEXT: vmov q5, q7
-; CHECK-NEXT: vmov q4, q6
+; CHECK-NEXT: vadd.i32 q4, q4, q0
+; CHECK-NEXT: vadd.i32 q2, q2, r2
+; CHECK-NEXT: vadd.i32 q5, q5, q0
+; CHECK-NEXT: vmlava.u32 r12, q2, q7
 ; CHECK-NEXT: bne .LBB11_5
 ; CHECK-NEXT: @ %bb.6: @ %middle.block
 ; CHECK-NEXT: @ in Loop: Header=BB11_4 Depth=4
diff --git a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
index 43ed5eefbf4c7..d6c5cde30ed73 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
@@ -18,50 +18,50 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32
 ; CHECK-NEXT: csel r7, r6, r5, hs
 ; CHECK-NEXT: add.w lr, r7, #1
 ; CHECK-NEXT: mov r4, r5
-; CHECK-NEXT: vldrh.u16 q0, [r0], #32
+; CHECK-NEXT: vldrh.u16 q1, [r0], #32
 ; CHECK-NEXT: movs r7, #0
 ; CHECK-NEXT: mov r8, r5
+; CHECK-NEXT: vldrh.u16 q2, [r1], #32
+; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q2
+; CHECK-NEXT: vldrh.u16 q0, [r0, #-16]
+; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q2
+; CHECK-NEXT: vldrh.u16 q2, [r1, #-16]
+; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q2
 ; CHECK-NEXT: vldrh.u16 q1, [r1], #32
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1
-; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1
-; CHECK-NEXT: vldrh.u16 q3, [r1, #-16]
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3
-; CHECK-NEXT: vldrh.u16 q0, [r1], #32
 ; CHECK-NEXT: sub.w lr, lr, #1
 ; CHECK-NEXT: cmp.w lr, #0
-; CHECK-NEXT: vldrh.u16 q1, [r0], #32
+; CHECK-NEXT: vldrh.u16 q3, [r0], #32
 ; CHECK-NEXT: beq .LBB0_3
 ; CHECK-NEXT: .p2align 2
 ; CHECK-NEXT: .LBB0_2: @ %while.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3
-; CHECK-NEXT: vldrh.u16 q3, [r1, #-16]
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0
-; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0
-; CHECK-NEXT: vldrh.u16 q1, [r0], #32
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3
-; CHECK-NEXT: vldrh.u16 q0, [r1], #32
+; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q2
+; CHECK-NEXT: vldrh.u16 q2, [r1, #-16]
+; CHECK-NEXT: vmlsldava.s16 r4, r7, q3, q1
+; CHECK-NEXT: vldrh.u16 q0, [r0, #-16]
+; CHECK-NEXT: vmlaldavax.s16 r8, r5, q3, q1
+; CHECK-NEXT: vldrh.u16 q3, [r0], #32
+; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q2
+; CHECK-NEXT: vldrh.u16 q1, [r1], #32
 ; CHECK-NEXT: le lr, .LBB0_2
 ; CHECK-NEXT: .LBB0_3:
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3
+; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q2
 ; CHECK-NEXT: movs r6, #14
 ; CHECK-NEXT: and.w r2, r6, r2, lsl #1
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0
-; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0
-; CHECK-NEXT: vldrh.u16 q0, [r1, #-16]
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q0
+; CHECK-NEXT: vmlaldavax.s16 r8, r5, q3, q1
+; CHECK-NEXT: vldrh.u16 q0, [r0, #-16]
+; CHECK-NEXT: vmlsldava.s16 r4, r7, q3, q1
+; CHECK-NEXT: vldrh.u16 q1, [r1, #-16]
+; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1
 ; CHECK-NEXT: vctp.16 r2
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q0
+; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1
 ; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrht.u16 q1, [r0]
+; CHECK-NEXT: vldrht.u16 q2, [r0]
 ; CHECK-NEXT: cmp r2, #9
 ; CHECK-NEXT: vpsttt
 ; CHECK-NEXT: vldrht.u16 q0, [r1]
-; CHECK-NEXT: vmlsldavat.s16 r4, r7, q1, q0
-; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q1, q0
+; CHECK-NEXT: vmlsldavat.s16 r4, r7, q2, q0
+; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q2, q0
 ; CHECK-NEXT: blo .LBB0_10
 ; CHECK-NEXT: @ %bb.4: @ %do.body.1
 ; CHECK-NEXT: subs r2, #8
diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
index 94d5490cead2f..6f2a0b2debc47 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
@@ -439,17 +439,18 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle4step_i16(<32 x i16> %src) {
 ; CHECK-NEXT: vmovx.f16 s1, s14
 ; CHECK-NEXT: vmovx.f16 s20, s0
 ; CHECK-NEXT: vins.f16 s23, s1
-; CHECK-NEXT: vmovx.f16 s1, s2
-; CHECK-NEXT: vins.f16 s20, s1
+; CHECK-NEXT: vmov.f32 s1, s2
+; CHECK-NEXT: vmovx.f16 s2, s2
 ; CHECK-NEXT: vmovx.f16 s21, s4
-; CHECK-NEXT: vmovx.f16 s1, s6
+; CHECK-NEXT: vins.f16 s20, s2
+; CHECK-NEXT: vmovx.f16 s2, s6
 ; CHECK-NEXT: vins.f16 s12, s14
 ; CHECK-NEXT: vins.f16 s8, s10
 ; CHECK-NEXT: vins.f16 s4, s6
-; CHECK-NEXT: vins.f16 s21, s1
-; CHECK-NEXT: vins.f16 s0, s2
-; CHECK-NEXT: vmov.f32 s1, s4
+; CHECK-NEXT: vins.f16 s21, s2
+; CHECK-NEXT: vins.f16 s0, s1
 ; CHECK-NEXT: vmov.f32 s2, s8
+; CHECK-NEXT: vmov.f32 s1, s4
 ; CHECK-NEXT: vmov.f32 s3, s12
 ; CHECK-NEXT: vadd.i16 q0, q0, q5
 ; CHECK-NEXT: vadd.i16 q0, q0, q4
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
index ab41069bfa258..ecb169898f9f0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
@@ -391,17 +391,18 @@ define void @vld4_v8i16_align1(ptr %src, ptr %dst) {
 ; CHECK-NEXT: vmovx.f16 s1, s2
 ; CHECK-NEXT: vmovx.f16 s20, s8
 ; CHECK-NEXT: vins.f16 s23, s1
-; CHECK-NEXT: vmovx.f16 s1, s10
-; CHECK-NEXT: vins.f16 s20, s1
+; CHECK-NEXT: vmov.f32 s1, s10
+; CHECK-NEXT: vmovx.f16 s10, s10
 ; CHECK-NEXT: vmovx.f16 s21, s12
-; CHECK-NEXT: vmovx.f16 s1, s14
+; CHECK-NEXT: vins.f16 s20, s10
+; CHECK-NEXT: vmovx.f16 s10, s14
 ; CHECK-NEXT: vins.f16 s0, s2
 ; CHECK-NEXT: vins.f16 s12, s14
 ; CHECK-NEXT: vins.f16 s4, s6
-; CHECK-NEXT: vins.f16 s8, s10
-; CHECK-NEXT: vins.f16 s21, s1
-; CHECK-NEXT: vmov.f32 s9, s12
+; CHECK-NEXT: vins.f16 s21, s10
 ; CHECK-NEXT: vmov.f32 s10, s4
+; CHECK-NEXT: vins.f16 s8, s1
+; CHECK-NEXT: vmov.f32 s9, s12
 ; CHECK-NEXT: vmov.f32 s11, s0
 ; CHECK-NEXT: vadd.i16 q0, q2, q5
 ; CHECK-NEXT: vadd.i16 q0, q0, q4
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll b/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll
index 04be18e3dd873..6656d44eec81e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll
@@ -344,14 +344,14 @@ define void @loop_absmax32_pred_c(ptr %0, i32 %1, ptr nocapture %2) {
 ; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: dlstp.32 lr, r1
 ; CHECK-NEXT: .LBB19_1: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q1, [r0], #16
-; CHECK-NEXT: vmaxnma.f32 q1, q0
-; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
+; CHECK-NEXT: vmaxnma.f32 q0, q1
 ; CHECK-NEXT: letp lr, .LBB19_1
 ; CHECK-NEXT: @ %bb.2:
-; CHECK-NEXT: vldr s0, .LCPI19_0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmaxnmav.f32 r0, q1
+; CHECK-NEXT: vldr s4, .LCPI19_0
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmaxnmav.f32 r0, q0
 ; CHECK-NEXT: vmov s0, r0
 ; CHECK-NEXT: vstr s0, [r2]
 ; CHECK-NEXT: pop {r7, pc}
@@ -538,14 +538,14 @@ define void @loop_absmax16_pred_c(ptr %0, i32 %1, ptr nocapture %2) {
 ; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: dlstp.16 lr, r1
 ; CHECK-NEXT: .LBB23_1: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.u16 q1, [r0], #8
-; CHECK-NEXT: vmaxnma.f16 q1, q0
-; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vldrh.u16 q0, [r0], #8
+; CHECK-NEXT: vmaxnma.f16 q0, q1
 ; CHECK-NEXT: letp lr, .LBB23_1
 ; CHECK-NEXT: @ %bb.2:
-; CHECK-NEXT: vldr.16 s0, .LCPI23_0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmaxnmav.f16 r0, q1
+; CHECK-NEXT: vldr.16 s4, .LCPI23_0
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmaxnmav.f16 r0, q0
 ; CHECK-NEXT: vmov s0, r0
 ; CHECK-NEXT: vstr.16 s0, [r2]
 ; CHECK-NEXT: pop {r7, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
index 26ab555c2c593..fb5f543fd0d3a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -1055,18 +1055,18 @@ define void @vst4_v4f16(ptr %src, ptr %dst) {
 ; CHECK-NEXT: vins.f16 s12, s2
 ; CHECK-NEXT: vmovx.f16 s2, s3
 ; CHECK-NEXT: vins.f16 s11, s2
-; CHECK-NEXT: vmovx.f16 s2, s4
-; CHECK-NEXT: vins.f16 s4, s6
-; CHECK-NEXT: vmovx.f16 s6, s6
+; CHECK-NEXT: vmov.f32 s2, s6
+; CHECK-NEXT: vmovx.f16 s6, s4
+; CHECK-NEXT: vins.f16 s4, s2
+; CHECK-NEXT: vmovx.f16 s2, s2
 ; CHECK-NEXT: vins.f16 s1, s3
-; CHECK-NEXT: vins.f16 s2, s6
-; CHECK-NEXT: vmovx.f16 s6, s7
+; CHECK-NEXT: vins.f16 s6, s2
+; CHECK-NEXT: vmovx.f16 s2, s7
 ; CHECK-NEXT: vmov.f32 s8, s5
-; CHECK-NEXT: vins.f16 s10, s6
+; CHECK-NEXT: vins.f16 s10, s2
 ; CHECK-NEXT: vmov.f32 s9, s1
 ; CHECK-NEXT: vmov.f32 s5, s0
 ; CHECK-NEXT: vstrh.16 q2, [r1, #16]
-; CHECK-NEXT: vmov.f32 s6, s2
 ; CHECK-NEXT: vmov.f32 s7, s12
 ; CHECK-NEXT: vstrh.16 q1, [r1]
 ; CHECK-NEXT: pop {r4, r5, r6, pc}
diff --git a/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll b/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll
index e6fcf56af6e8d..2929a04cc0637 100644
--- a/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll
+++ b/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll
@@ -63,8 +63,8 @@ define hidden i32 @f(i32 %n) local_unnamed_addr #0 {
 ; CHECK-NEXT: subs r0, #4
 ; CHECK-NEXT: sub.w r3, r4, #16
 ; CHECK-NEXT: add.w lr, r2, r0, lsr #2
-; CHECK-NEXT: movs r2, #0
 ; CHECK-NEXT: movs r0, #0
+; CHECK-NEXT: movs r2, #0
 ; CHECK-NEXT: .LBB0_5: @ %for.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: ldr r5, [r3, #16]!
diff --git a/llvm/test/CodeGen/X86/3addr-16bit.ll b/llvm/test/CodeGen/X86/3addr-16bit.ll
index c9390d91d59c2..2b692bff0461e 100644
--- a/llvm/test/CodeGen/X86/3addr-16bit.ll
+++ b/llvm/test/CodeGen/X86/3addr-16bit.ll
@@ -10,27 +10,27 @@ define zeroext i16 @test1(i16 zeroext %c, i16 zeroext %k) nounwind ssp {
 ; X64-LABEL: test1:
 ; X64: ## %bb.0: ## %entry
 ; X64-NEXT: movl %esi, %eax
-; X64-NEXT: incl %eax
-; X64-NEXT: cmpw %di, %si
+; X64-NEXT: incl %esi
+; X64-NEXT: cmpw %di, %ax
 ; X64-NEXT: jne LBB0_2
 ; X64-NEXT: ## %bb.1: ## %bb
 ; X64-NEXT: pushq %rbx
-; X64-NEXT: movzwl %ax, %ebx
+; X64-NEXT: movzwl %si, %ebx
 ; X64-NEXT: movl %ebx, %edi
 ; X64-NEXT: callq _foo
 ; X64-NEXT: movl %ebx, %eax
 ; X64-NEXT: popq %rbx
 ; X64-NEXT: retq
 ; X64-NEXT: LBB0_2: ## %bb1
-; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: movzwl %si, %eax
 ; X64-NEXT: retq
 ;
 ; X86-LABEL: test1:
 ; X86: ## %bb.0: ## %entry
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %ecx
 ; X86-NEXT: incl %eax
 ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx
 ; X86-NEXT: jne LBB0_2
@@ -63,27 +63,27 @@ define zeroext i16 @test2(i16 zeroext %c, i16 zeroext %k) nounwind ssp {
 ; X64-LABEL: test2:
 ; X64: ## %bb.0: ## %entry
 ; X64-NEXT: movl %esi, %eax
-; X64-NEXT: decl %eax
-; X64-NEXT: cmpw %di, %si
+; X64-NEXT: decl %esi
+; X64-NEXT: cmpw %di, %ax
 ; X64-NEXT: jne LBB1_2
 ; X64-NEXT: ## %bb.1: ## %bb
 ; X64-NEXT: pushq %rbx
-; X64-NEXT: movzwl %ax, %ebx
+; X64-NEXT: movzwl %si, %ebx
 ; X64-NEXT: movl %ebx, %edi
 ; X64-NEXT: callq _foo
 ; X64-NEXT: movl %ebx, %eax
 ; X64-NEXT: popq %rbx
 ; X64-NEXT: retq
 ; X64-NEXT: LBB1_2: ## %bb1
-; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: movzwl %si, %eax
 ; X64-NEXT: retq
 ;
 ; X86-LABEL: test2:
 ; X86: ## %bb.0: ## %entry
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %ecx
 ; X86-NEXT: decl %eax
 ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx
 ; X86-NEXT: jne LBB1_2
@@ -118,27 +118,27 @@ define zeroext i16 @test3(i16 zeroext %c, i16 zeroext %k) nounwind ssp {
 ; X64-LABEL: test3:
 ; X64: ## %bb.0: ## %entry
 ; X64-NEXT: movl %esi, %eax
-; X64-NEXT: addl $2, %eax
-; X64-NEXT: cmpw %di, %si
+; X64-NEXT: addl $2, %esi
+; X64-NEXT: cmpw %di, %ax
 ; X64-NEXT: jne LBB2_2
 ; X64-NEXT: ## %bb.1: ## %bb
 ; X64-NEXT: pushq %rbx
-; X64-NEXT: movzwl %ax, %ebx
+; X64-NEXT: movzwl %si, %ebx
 ; X64-NEXT: movl %ebx, %edi
 ; X64-NEXT: callq _foo
 ; X64-NEXT: movl %ebx, %eax
 ; X64-NEXT: popq %rbx
 ; X64-NEXT: retq
 ; X64-NEXT: LBB2_2: ## %bb1
-; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: movzwl %si, %eax
 ; X64-NEXT: retq
 ;
 ; X86-LABEL: test3:
 ; X86: ## %bb.0: ## %entry
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %ecx
 ; X86-NEXT: addl $2, %eax
 ; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx
 ; X86-NEXT: jne LBB2_2
@@ -171,19 +171,19 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp {
 ; X64-LABEL: test4:
 ; X64: ## %bb.0: ## %entry
 ; X64-NEXT: movl %esi, %eax
-; X64-NEXT: addl %edi, %eax
-; X64-NEXT: cmpw %di, %si
+; X64-NEXT: addl %edi, %esi
+; X64-NEXT: cmpw %di, %ax
 ; X64-NEXT: jne LBB3_2
 ; X64-NEXT: ## %bb.1: ## %bb
 ; X64-NEXT: pushq %rbx
-; X64-NEXT: movzwl %ax, %ebx
+; X64-NEXT: movzwl %si, %ebx
 ; X64-NEXT: movl %ebx, %edi
 ; X64-NEXT: callq _foo
 ; X64-NEXT: movl %ebx, %eax
 ; X64-NEXT: popq %rbx
 ; X64-NEXT: retq
 ; X64-NEXT: LBB3_2: ## %bb1
-; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: movzwl %si, %eax
 ; X64-NEXT: retq
 ;
 ; X86-LABEL: test4:
@@ -191,8 +191,8 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp {
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: subl $8, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %edx
 ; X86-NEXT: addl %ecx, %eax
 ; X86-NEXT: cmpw %cx, %dx
 ; X86-NEXT: jne LBB3_2
diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-general.ll b/llvm/test/CodeGen/X86/apx/no-rex2-general.ll
index 805fc7ccaab76..2b34739fa80e3 100644
--- a/llvm/test/CodeGen/X86/apx/no-rex2-general.ll
+++ b/llvm/test/CodeGen/X86/apx/no-rex2-general.ll
@@ -1,76 +1,80 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr,+avx | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr,+avx | FileCheck %s --check-prefix=AVX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr,+avx | FileCheck %s --check-prefixes=CHECK,AVX
 
 define i32 @map0(ptr nocapture noundef readonly %a, i64 noundef %b) {
- ; SSE-LABEL: name: map0
- ; SSE: bb.0.entry:
- ; SSE-NEXT: liveins: $rdi, $rsi
- ; SSE-NEXT: {{ $}}
- ; SSE-NEXT: [[COPY:%[0-9]+]]:gr64_nosp = COPY $rsi
- ; SSE-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi
- ; SSE-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s32) from %ir.add.ptr)
- ; SSE-NEXT: $eax = COPY [[MOV32rm]]
- ; SSE-NEXT: RET 0, $eax
- ; AVX-LABEL: name: map0
- ; AVX: bb.0.entry:
- ; AVX-NEXT: liveins: $rdi, $rsi
- ; AVX-NEXT: {{ $}}
- ; AVX-NEXT: [[COPY:%[0-9]+]]:gr64_nosp = COPY $rsi
- ; AVX-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi
- ; AVX-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s32) from %ir.add.ptr)
- ; AVX-NEXT: $eax = COPY [[MOV32rm]]
- ; AVX-NEXT: RET 0, $eax
+; CHECK-LABEL: map0:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rsi, %r16
+; CHECK-NEXT: movq %rdi, %r17
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl (%r17,%r16,4), %eax
+; CHECK-NEXT: retq
 entry:
 %add.ptr = getelementptr inbounds i32, ptr %a, i64 %b
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
 %0 = load i32, ptr %add.ptr
 ret i32 %0
 }
 
-define i32 @map1_or_vex(<2 x double> noundef %a) {
- ; SSE-LABEL: name: map1_or_vex
- ; SSE: bb.0.entry:
- ; SSE-NEXT: liveins: $xmm0
- ; SSE-NEXT: {{ $}}
- ; SSE-NEXT: [[COPY:%[0-9]+]]:vr128 = COPY $xmm0
- ; SSE-NEXT: [[CVTSD2SIrr_Int:%[0-9]+]]:gr32 = nofpexcept CVTSD2SIrr_Int [[COPY]], implicit $mxcsr
- ; SSE-NEXT: $eax = COPY [[CVTSD2SIrr_Int]]
- ; SSE-NEXT: RET 0, $eax
- ; AVX-LABEL: name: map1_or_vex
- ; AVX: bb.0.entry:
- ; AVX-NEXT: liveins: $xmm0
- ; AVX-NEXT: {{ $}}
- ; AVX-NEXT: [[COPY:%[0-9]+]]:vr128 = COPY $xmm0
- ; AVX-NEXT: [[VCVTSD2SIrr_Int:%[0-9]+]]:gr32_norex2 = nofpexcept VCVTSD2SIrr_Int [[COPY]], implicit $mxcsr
- ; AVX-NEXT: $eax = COPY [[VCVTSD2SIrr_Int]]
- ; AVX-NEXT: RET 0, $eax
+define i32 @map1_or_vex(<2 x double> noundef %a) nounwind {
+; SSE-LABEL: map1_or_vex:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: cvtsd2si %xmm0, %r16d
+; SSE-NEXT: #APP
+; SSE-NEXT: nop
+; SSE-NEXT: #NO_APP
+; SSE-NEXT: movl %r16d, %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: map1_or_vex:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: vcvtsd2si %xmm0, %ebx
+; AVX-NEXT: #APP
+; AVX-NEXT: nop
+; AVX-NEXT: #NO_APP
+; AVX-NEXT: movl %ebx, %eax
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: retq
 entry:
 %0 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a)
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
 ret i32 %0
 }
 
-define <2 x i64> @map2_or_vex(ptr nocapture noundef readonly %b, i64 noundef %c) {
- ; SSE-LABEL: name: map2_or_vex
- ; SSE: bb.0.entry:
- ; SSE-NEXT: liveins: $rdi, $rsi
- ; SSE-NEXT: {{ $}}
- ; SSE-NEXT: [[COPY:%[0-9]+]]:gr64_norex2_nosp = COPY $rsi
- ; SSE-NEXT: [[COPY1:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; SSE-NEXT: [[PABSBrm:%[0-9]+]]:vr128 = PABSBrm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s128) from %ir.add.ptr)
- ; SSE-NEXT: $xmm0 = COPY [[PABSBrm]]
- ; SSE-NEXT: RET 0, $xmm0
- ; AVX-LABEL: name: map2_or_vex
- ; AVX: bb.0.entry:
- ; AVX-NEXT: liveins: $rdi, $rsi
- ; AVX-NEXT: {{ $}}
- ; AVX-NEXT: [[COPY:%[0-9]+]]:gr64_norex2_nosp = COPY $rsi
- ; AVX-NEXT: [[COPY1:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; AVX-NEXT: [[VPABSBrm:%[0-9]+]]:vr128 = VPABSBrm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s128) from %ir.add.ptr)
- ; AVX-NEXT: $xmm0 = COPY [[VPABSBrm]]
- ; AVX-NEXT: RET 0, $xmm0
+define <2 x i64> @map2_or_vex(ptr nocapture noundef readonly %b, i64 noundef %c) nounwind {
+; SSE-LABEL: map2_or_vex:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: movq %rsi, %rbx
+; SSE-NEXT: movq %rdi, %r14
+; SSE-NEXT: #APP
+; SSE-NEXT: nop
+; SSE-NEXT: #NO_APP
+; SSE-NEXT: pabsb (%r14,%rbx,4), %xmm0
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r14
+; SSE-NEXT: retq
+;
+; AVX-LABEL: map2_or_vex:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %r14
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: movq %rsi, %rbx
+; AVX-NEXT: movq %rdi, %r14
+; AVX-NEXT: #APP
+; AVX-NEXT: nop
+; AVX-NEXT: #NO_APP
+; AVX-NEXT: vpabsb (%r14,%rbx,4), %xmm0
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: popq %r14
+; AVX-NEXT: retq
 entry:
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
 %add.ptr = getelementptr inbounds i32, ptr %b, i64 %c
 %a = load <2 x i64>, ptr %add.ptr
 %0 = bitcast <2 x i64> %a to <16 x i8>
diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll
index 5fa4cb4c8826b..c193680607f76 100644
--- a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll
+++ b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll
@@ -1,17 +1,20 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+amx-tile,+egpr | FileCheck %s
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+amx-tile,+egpr | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+amx-tile,+egpr | FileCheck %s
 
-define dso_local void @amx(ptr noundef %data) {
- ; CHECK-LABEL: name: amx
- ; CHECK: bb.0.entry:
- ; CHECK-NEXT: liveins: $rdi
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_norex2_nosp = MOV32ri64 8
- ; CHECK-NEXT: PTILELOADD 4, [[COPY]], 1, killed [[MOV32ri64_]], 0, $noreg
- ; CHECK-NEXT: RET 0
- entry:
+define dso_local void @amx(ptr noundef %data) nounwind {
+; CHECK-LABEL: amx:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl $8, %eax
+; CHECK-NEXT: tileloadd (%rbx,%rax), %tmm4
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+entry:
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
 call void @llvm.x86.tileloadd64(i8 4, ptr %data, i64 8)
 ret void
 }
diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll
index a9ca591a156c2..4692a58d095a6 100644
--- a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll
+++ b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll
@@ -1,17 +1,22 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=-sse,+egpr | FileCheck %s
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=-sse,+egpr | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=-sse,+egpr | FileCheck %s
 
-define void @x87(ptr %0, ptr %1) {
- ; CHECK-LABEL: name: x87
- ; CHECK: bb.0 (%ir-block.2):
- ; CHECK-NEXT: liveins: $rdi, $rsi
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64_norex2 = COPY $rsi
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; CHECK-NEXT: [[LD_Fp32m:%[0-9]+]]:rfp32 = nofpexcept LD_Fp32m [[COPY1]], 1, $noreg, 0, $noreg, implicit-def dead $fpsw, implicit $fpcw :: (load (s32) from %ir.0)
- ; CHECK-NEXT: nofpexcept ST_Fp32m [[COPY]], 1, $noreg, 0, $noreg, killed [[LD_Fp32m]], implicit-def dead $fpsw, implicit $fpcw :: (store (s32) into %ir.1)
- ; CHECK-NEXT: RET 0
+define void @x87(ptr %0, ptr %1) nounwind {
+; CHECK-LABEL: x87:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq %rsi, %rbx
+; CHECK-NEXT: movq %rdi, %r14
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: flds (%r14)
+; CHECK-NEXT: fstps (%rbx)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: retq
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
 %3 = load float, ptr %0
 store float %3, ptr %1
 ret void
diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-special.ll b/llvm/test/CodeGen/X86/apx/no-rex2-special.ll
index 86534427a9eae..f2025b5c8cbf8 100644
--- a/llvm/test/CodeGen/X86/apx/no-rex2-special.ll
+++ b/llvm/test/CodeGen/X86/apx/no-rex2-special.ll
@@ -1,70 +1,81 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+xsave,+egpr | FileCheck %s
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+xsave,+egpr | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+xsave,+egpr | FileCheck %s
 
-define void @test_xsave(ptr %ptr, i32 %hi, i32 %lo) {
- ; CHECK-LABEL: name: test_xsave
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $rdi, $esi, $edx
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; CHECK-NEXT: $edx = COPY [[COPY1]]
- ; CHECK-NEXT: $eax = COPY [[COPY]]
- ; CHECK-NEXT: XSAVE [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax
- ; CHECK-NEXT: RET 0
+define void @test_xsave(ptr %ptr, i32 %hi, i32 %lo) nounwind {
+; CHECK-LABEL: test_xsave:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %edx, %r16d
+; CHECK-NEXT: movl %esi, %edx
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl %r16d, %eax
+; CHECK-NEXT: xsave (%rbx)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
 call void @llvm.x86.xsave(ptr %ptr, i32 %hi, i32 %lo)
 ret void;
 }
 declare void @llvm.x86.xsave(ptr, i32, i32)
 
-define void @test_xsave64(ptr %ptr, i32 %hi, i32 %lo) {
- ; CHECK-LABEL: name: test_xsave64
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $rdi, $esi, $edx
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; CHECK-NEXT: $edx = COPY [[COPY1]]
- ; CHECK-NEXT: $eax = COPY [[COPY]]
- ; CHECK-NEXT: XSAVE64 [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax
- ; CHECK-NEXT: RET 0
+define void @test_xsave64(ptr %ptr, i32 %hi, i32 %lo) nounwind {
+; CHECK-LABEL: test_xsave64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %edx, %r16d
+; CHECK-NEXT: movl %esi, %edx
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl %r16d, %eax
+; CHECK-NEXT: xsave64 (%rbx)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
 call void @llvm.x86.xsave64(ptr %ptr, i32 %hi, i32 %lo)
 ret void;
 }
 declare void @llvm.x86.xsave64(ptr, i32, i32)
 
-define void @test_xrstor(ptr %ptr, i32 %hi, i32 %lo) {
- ; CHECK-LABEL: name: test_xrstor
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $rdi, $esi, $edx
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; CHECK-NEXT: $edx = COPY [[COPY1]]
- ; CHECK-NEXT: $eax = COPY [[COPY]]
- ; CHECK-NEXT: XRSTOR [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax
- ; CHECK-NEXT: RET 0
+define void @test_xrstor(ptr %ptr, i32 %hi, i32 %lo) nounwind {
+; CHECK-LABEL: test_xrstor:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %edx, %r16d
+; CHECK-NEXT: movl %esi, %edx
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl %r16d, %eax
+; CHECK-NEXT: xrstor (%rbx)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
 call void @llvm.x86.xrstor(ptr %ptr, i32 %hi, i32 %lo)
 ret void;
 }
 declare void @llvm.x86.xrstor(ptr, i32, i32)
 
-define void @test_xrstor64(ptr %ptr, i32 %hi, i32 %lo) {
- ; CHECK-LABEL: name: test_xrstor64
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $rdi, $esi, $edx
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; CHECK-NEXT: $edx = COPY [[COPY1]]
- ; CHECK-NEXT: $eax = COPY [[COPY]]
- ; CHECK-NEXT: XRSTOR64 [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax
- ; CHECK-NEXT: RET 0
+define void @test_xrstor64(ptr %ptr, i32 %hi, i32 %lo) nounwind {
+; CHECK-LABEL: test_xrstor64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %edx, %r16d
+; CHECK-NEXT: movl %esi, %edx
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl %r16d, %eax
+; CHECK-NEXT: xrstor64 (%rbx)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
 call void @llvm.x86.xrstor64(ptr %ptr, i32 %hi, i32 %lo)
 ret void;
 }
diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
index b4d40fee01e41..71887e369bd18 100644
--- a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
+++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
@@ -2156,15 +2156,17 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_brz(ptr %v, i16 zeroext %c) no
 ; X64-LABEL: atomic_shl1_mask01_xor_16_gpr_brz:
 ; X64: # %bb.0: # %entry
 ; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: movl %ecx, %edx
 ; X64-NEXT: andb $15, %cl
-; X64-NEXT: movl $1, %edx
-; X64-NEXT: shll %cl, %edx
+; X64-NEXT: movl $1, %esi
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: shll %cl, %esi
 ; X64-NEXT: movzwl (%rdi), %eax
 ; X64-NEXT: .p2align 4
 ; X64-NEXT: .LBB34_1: # %atomicrmw.start
 ; X64-NEXT: # =>This Inner Loop Header: Depth=1
 ; X64-NEXT: movl %eax, %ecx
-; X64-NEXT: xorl %edx, %ecx
+; X64-NEXT: xorl %esi, %ecx
 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
 ; X64-NEXT: lock cmpxchgw %cx, (%rdi)
 ; X64-NEXT: # kill: def $ax killed $ax def $eax
@@ -2172,12 +2174,12 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_brz(ptr %v, i16 zeroext %c) no
 ; X64-NEXT: # %bb.2: # %atomicrmw.end
 ; X64-NEXT: movzwl %ax, %ecx
 ; X64-NEXT: movw $123, %ax
-; X64-NEXT: testl %ecx, %edx
+; X64-NEXT: testl %ecx, %esi
 ; X64-NEXT: je .LBB34_3
 ; X64-NEXT: # %bb.4: # %return
 ; X64-NEXT: retq
 ; X64-NEXT: .LBB34_3: # %if.then
-; X64-NEXT: movzwl %si, %eax
+; X64-NEXT: movzwl %dx, %eax
 ; X64-NEXT: movzwl (%rdi,%rax,2), %eax
 ; X64-NEXT: retq
 entry:
@@ -3398,10 +3400,12 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n
 ; X64-LABEL: atomic_shl1_mask01_and_16_gpr_brnz:
 ; X64: # %bb.0: # %entry
 ; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: movl %ecx, %edx
 ; X64-NEXT: andb $15, %cl
-; X64-NEXT: movl $1, %edx
-; X64-NEXT: shll %cl, %edx
+; X64-NEXT: movl $1, %esi
+; X64-NEXT: shll %cl, %esi
 ; X64-NEXT: movl $-2, %r8d
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT: roll %cl, %r8d
 ; X64-NEXT: movzwl (%rdi), %eax
 ; X64-NEXT: .p2align 4
@@ -3415,10 +3419,10 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n
 ; X64-NEXT: jne .LBB52_1
 ; X64-NEXT: # %bb.2: # %atomicrmw.end
 ; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: testl %eax, %edx
+; X64-NEXT: testl %eax, %esi
 ; X64-NEXT: je .LBB52_3
 ; X64-NEXT: # %bb.4: # %if.then
-; X64-NEXT: movzwl %si, %eax
+; X64-NEXT: movzwl %dx, %eax
 ; X64-NEXT: movzwl (%rdi,%rax,2), %eax
 ; X64-NEXT: retq
 ; X64-NEXT: .LBB52_3:
diff --git a/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll b/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll
index 105ee7f82ee79..e118f5dbc1534 100644
--- a/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll
+++ b/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll
@@ -46,8 +46,9 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x
 ; CHECK-NEXT: orl %edx, %eax
 ; CHECK-NEXT: lock cmpxchgl %ecx, (%rbx)
 ; CHECK-NEXT: setne %cl
-; CHECK-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-NEXT: movl %eax, %edx
 ; CHECK-NEXT: shrl $16, %eax
+; CHECK-NEXT: pinsrw $0, %edx, %xmm0
 ; CHECK-NEXT: pinsrw $0, %eax, %xmm1
 ; CHECK-NEXT: testb %cl, %cl
 ; CHECK-NEXT: jne .LBB0_1
diff --git a/llvm/test/CodeGen/X86/basic-block-sections-list.ll b/llvm/test/CodeGen/X86/basic-block-sections-list.ll
index 45ef452f4f5c1..d652a540f3e9c 100644
--- a/llvm/test/CodeGen/X86/basic-block-sections-list.ll
+++ b/llvm/test/CodeGen/X86/basic-block-sections-list.ll
@@ -1,17 +1,13 @@
-;; Check the basic block sections list option.
-;; version 0 profile:
-; RUN: echo '!_Z3foob' > %t1
+;; Check that specifying the function in the basic block sections profile
+;; without any other directives is a noop.
 ;;
-;; version 1 profile:
-; RUN: echo 'v1' > %t2
-; RUN: echo 'f _Z3foob' >> %t2
+;; Specify the bb sections profile:
+; RUN: echo 'v1' > %t
+; RUN: echo 'f _Z3foob' >> %t
 ;;
-; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t1 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-FUNCTION-SECTION
-; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t1 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-NO-FUNCTION-SECTION
-; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t1 -unique-basic-block-section-names --bbsections-guided-section-prefix=false | FileCheck %s -check-prefix=LINUX-SECTIONS-NO-GUIDED-PREFIX
-; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t2 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-FUNCTION-SECTION
-; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t2 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-NO-FUNCTION-SECTION
-; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t2 -unique-basic-block-section-names --bbsections-guided-section-prefix=false | FileCheck %s -check-prefix=LINUX-SECTIONS-NO-GUIDED-PREFIX
+; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t > %bbsections
+; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections > %orig
+; RUN: diff -u %orig %bbsections
 
 define i32 @_Z3foob(i1 zeroext %0) nounwind {
 %2 = alloca i32, align 4
@@ -41,45 +37,3 @@ define i32 @_Z3foob(i1 zeroext %0) nounwind {
 declare i32 @_Z3barv() #1
 declare i32 @_Z3bazv() #1
-
-define i32 @_Z3zipb(i1 zeroext %0) nounwind {
- %2 = alloca i32, align 4
- %3 = alloca i8, align 1
- %4 = zext i1 %0 to i8
- store i8 %4, ptr %3, align 1
- %5 = load i8, ptr %3, align 1
- %6 = trunc i8 %5 to i1
- %7 = zext i1 %6 to i32
- %8 = icmp sgt i32 %7, 0
- br i1 %8, label %9, label %11
-
-9: ; preds = %1
- %10 = call i32 @_Z3barv()
- store i32 %10, ptr %2, align 4
- br label %13
-
-11: ; preds = %1
- %12 = call i32 @_Z3bazv()
- store i32 %12, ptr %2, align 4
- br label %13
-
-13: ; preds = %11, %9
- %14 = load i32, ptr %2, align 4
- ret i32 %14
-}
-
-; LINUX-SECTIONS-NO-GUIDED-PREFIX: .section .text._Z3foob,"ax",@progbits
-; LINUX-SECTIONS: .section .text.hot._Z3foob,"ax",@progbits
-; LINUX-SECTIONS: _Z3foob:
-; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.1,"ax",@progbits
-; LINUX-SECTIONS: _Z3foob.__part.1:
-; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.2,"ax",@progbits
-; LINUX-SECTIONS: _Z3foob.__part.2:
-; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.3,"ax",@progbits
-; LINUX-SECTIONS: _Z3foob.__part.3:
-
-; LINUX-SECTIONS-FUNCTION-SECTION: .section .text._Z3zipb,"ax",@progbits
-; LINUX-SECTIONS-NO-FUNCTION-SECTION-NOT: .section .text{{.*}}._Z3zipb,"ax",@progbits
-; LINUX-SECTIONS: _Z3zipb:
-; LINUX-SECTIONS-NOT: .section .text{{.*}}._Z3zipb.__part.{{[0-9]+}},"ax",@progbits
-; LINUX-SECTIONS-NOT: _Z3zipb.__part.{{[0-9]+}}:
diff --git a/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll b/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll
index d481b147662dc..6e0db20ca0492 100644
--- a/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll
+++ b/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll
@@ -1,6 +1,8 @@
-; RUN: echo "!foo" > %t.order.txt
-; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t.order.txt | FileCheck --check-prefix=SOURCE-DRIFT %s
-; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t.order.txt -bbsections-detect-source-drift=false | FileCheck --check-prefix=HASH-CHECK-DISABLED %s
+; RUN: echo "v1" > %t
+; RUN: echo "f foo" >> %t
+; RUN: echo "c 0" >> %t
+; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t | FileCheck --check-prefix=SOURCE-DRIFT %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t -bbsections-detect-source-drift=false | FileCheck --check-prefix=HASH-CHECK-DISABLED %s
 
 define dso_local i32 @foo(i1 zeroext %0, i1 zeroext %1) !annotation !1 {
 br i1 %0, label %5, label %3
diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
index 86d7df0c2d648..fae1ff90dd8d5 100644
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -216,8 +216,8 @@ define i1 @trunc_v8i16_cmp(<8 x i16> %a0) nounwind {
 define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind {
 ; SSE-LABEL: bitcast_v16i8_to_v2i8:
 ; SSE: # %bb.0:
-; SSE-NEXT: pmovmskb %xmm0, %ecx
-; SSE-NEXT: movl %ecx, %eax
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: movl %eax, %ecx
 ; SSE-NEXT: shrl $8, %eax
 ; SSE-NEXT: addb %cl, %al
 ; SSE-NEXT: # kill: def $al killed $al killed $eax
@@ -225,8 +225,8 @@ define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind {
 ;
 ; AVX12-LABEL: bitcast_v16i8_to_v2i8:
 ; AVX12: # %bb.0:
-; AVX12-NEXT: vpmovmskb %xmm0, %ecx
-; AVX12-NEXT: movl %ecx, %eax
+; AVX12-NEXT: vpmovmskb %xmm0, %eax
+; AVX12-NEXT: movl %eax, %ecx
 ; AVX12-NEXT: shrl $8, %eax
 ; AVX12-NEXT: addb %cl, %al
 ; AVX12-NEXT: # kill: def $al killed $al killed $eax
@@ -441,8 +441,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind {
 ; SSE-LABEL: bitcast_v16i16_to_v2i8:
 ; SSE: # %bb.0:
 ; SSE-NEXT: packsswb %xmm1, %xmm0
-; SSE-NEXT: pmovmskb %xmm0, %ecx
-; SSE-NEXT: movl %ecx, %eax
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: movl %eax, %ecx
 ; SSE-NEXT: shrl $8, %eax
 ; SSE-NEXT: addb %cl, %al
 ; SSE-NEXT: # kill: def $al killed $al killed $eax
@@ -452,8 +452,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovmskb %xmm0, %ecx
-; AVX1-NEXT: movl %ecx, %eax
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: movl %eax, %ecx
 ; AVX1-NEXT: shrl $8, %eax
 ; AVX1-NEXT: addb %cl, %al
 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
@@ -464,8 +464,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %ecx
-; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: movl %eax, %ecx
 ; AVX2-NEXT: shrl $8, %eax
 ; AVX2-NEXT: addb %cl, %al
 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
@@ -762,8 +762,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind {
 ; SSE-NEXT: packssdw %xmm3, %xmm2
 ; SSE-NEXT: packssdw %xmm1, %xmm0
 ; SSE-NEXT: packsswb %xmm2, %xmm0
-; SSE-NEXT: pmovmskb %xmm0, %ecx
-; SSE-NEXT: movl %ecx, %eax
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: movl %eax, %ecx
 ; SSE-NEXT: shrl $8, %eax
 ; SSE-NEXT: addb %cl, %al
 ; SSE-NEXT: # kill: def $al killed $al killed $eax
@@ -776,8 +776,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind {
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovmskb %xmm0, %ecx
-; AVX1-NEXT: movl %ecx, %eax
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: movl %eax, %ecx
 ; AVX1-NEXT: shrl $8, %eax
 ; AVX1-NEXT: addb %cl, %al
 ; AVX1-NEXT: # kill: def $al killed $al killed $eax
@@ -793,8 +793,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind {
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX2-NEXT: vpmovmskb %xmm0, %ecx
-; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: movl %eax, %ecx
 ; AVX2-NEXT: shrl $8, %eax
 ; AVX2-NEXT: addb %cl, %al
 ; AVX2-NEXT: # kill: def $al killed $al killed $eax
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 9d31c298bfb9e..e9e9ee9c97593 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE,SSE4
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512
@@ -956,6 +956,192 @@ define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind {
 ret i1 %cmp
 }
 
+; Load hidden behind bitcast
+define <8 x i16> @complement_ne_i128_bitcast(ptr %word, i32 %position) nounwind {
+; X86-LABEL: complement_ne_i128_bitcast:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $80, %esp
+; X86-NEXT: movzbl 16(%ebp), %ecx
+; X86-NEXT: movl 12(%ebp), %edx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 56(%esp,%eax), %esi
+; X86-NEXT: movl 60(%esp,%eax), %ebx
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %ebx
+; X86-NEXT: movzwl 14(%edx), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll $16, %edi
+; X86-NEXT: movzwl 12(%edx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: xorl %ebx, %edi
+; X86-NEXT: movl 52(%esp,%eax), %edx
+; X86-NEXT: movzbl 16(%ebp), %ecx
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movzwl 10(%eax), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll $16, %ebx
+; X86-NEXT: movzwl 8(%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %eax, %ebx
+; X86-NEXT: xorl %esi, %ebx
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl 48(%esp,%eax), %esi
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movzwl 6(%ecx), %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: movzwl 4(%ecx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: xorl %edx, %eax
+; X86-NEXT: movzbl 16(%ebp), %ecx
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movzwl 2(%ecx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll $16, %edx
+; X86-NEXT: movzwl (%ecx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: xorl %esi, %edx
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %edi, 12(%ecx)
+; X86-NEXT: movl %ebx, 8(%ecx)
+; X86-NEXT: movl %eax, 4(%ecx)
+; X86-NEXT: movl %edx, (%ecx)
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movw %dx, 14(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movw %dx, 12(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movw %dx, 10(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movw %dx, 8(%eax)
+; X86-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-NEXT: movw %dx, 6(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movw %dx, 4(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movw %cx, 2(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movw %cx, (%eax)
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
+; SSE2-LABEL: complement_ne_i128_bitcast:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movl %esi, %ecx
+; SSE2-NEXT: movl $1, %eax
+; SSE2-NEXT: xorl %edx, %edx
+; SSE2-NEXT: shldq %cl, %rax, %rdx
+; SSE2-NEXT: xorl %esi, %esi
+; SSE2-NEXT: shlq %cl, %rax
+; SSE2-NEXT: testb $64, %cl
+; SSE2-NEXT: cmovneq %rax, %rdx
+; SSE2-NEXT: cmovneq %rsi, %rax
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: xorq %rdx, 8(%rdi)
+; SSE2-NEXT: movq %xmm0, %rcx
;
SSE2-NEXT: xorq %rax, %rcx +; SSE2-NEXT: movq %rcx, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: complement_ne_i128_bitcast: +; SSE4: # %bb.0: +; SSE4-NEXT: movl %esi, %ecx +; SSE4-NEXT: movl $1, %eax +; SSE4-NEXT: xorl %edx, %edx +; SSE4-NEXT: shldq %cl, %rax, %rdx +; SSE4-NEXT: shlq %cl, %rax +; SSE4-NEXT: xorl %esi, %esi +; SSE4-NEXT: testb $64, %cl +; SSE4-NEXT: cmovneq %rax, %rdx +; SSE4-NEXT: cmovneq %rsi, %rax +; SSE4-NEXT: movdqa (%rdi), %xmm0 +; SSE4-NEXT: movq %xmm0, %rcx +; SSE4-NEXT: xorq %rax, %rcx +; SSE4-NEXT: pextrq $1, %xmm0, %rax +; SSE4-NEXT: xorq %rdx, %rax +; SSE4-NEXT: movq %rax, 8(%rdi) +; SSE4-NEXT: movq %rcx, (%rdi) +; SSE4-NEXT: retq +; +; AVX2-LABEL: complement_ne_i128_bitcast: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %esi, %ecx +; AVX2-NEXT: movl $1, %eax +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: shldq %cl, %rax, %rdx +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: shlxq %rcx, %rax, %rax +; AVX2-NEXT: testb $64, %cl +; AVX2-NEXT: cmovneq %rax, %rdx +; AVX2-NEXT: cmovneq %rsi, %rax +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: xorq %rax, %rcx +; AVX2-NEXT: xorq %rdx, %rsi +; AVX2-NEXT: movq %rsi, 8(%rdi) +; AVX2-NEXT: movq %rcx, (%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: complement_ne_i128_bitcast: +; AVX512: # %bb.0: +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movl $1, %edx +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: shldq %cl, %rdx, %rsi +; AVX512-NEXT: shlxq %rcx, %rdx, %rdx +; AVX512-NEXT: testb $64, %cl +; AVX512-NEXT: cmovneq %rdx, %rsi +; AVX512-NEXT: cmovneq %rax, %rdx +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: xorq %rdx, %rax +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: xorq %rsi, %rcx +; AVX512-NEXT: movq %rcx, 8(%rdi) +; AVX512-NEXT: movq %rax, (%rdi) +; AVX512-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %ldv = load <8 x i16>, ptr %word + %ld = bitcast <8 x i16> %ldv to i128 + %test = and i128 %ld, %bit + %res = xor i128 %ld, %bit + store i128 %res, ptr %word + ret <8 x i16> %ldv +} + ; Multiple loads in store chain define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-LABEL: reset_multiload_i128: @@ -975,10 +1161,10 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-NEXT: btrl %edx, %ebx ; X86-NEXT: btl %edx, %edi ; X86-NEXT: movl %ebx, (%ecx,%esi) -; X86-NEXT: jae .LBB22_2 +; X86-NEXT: jae .LBB23_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB22_2: +; X86-NEXT: .LBB23_2: ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -994,10 +1180,10 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X64-NEXT: btrl %esi, %r8d ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: btl %esi, %r9d -; X64-NEXT: jb .LBB22_2 +; X64-NEXT: jb .LBB23_2 ; X64-NEXT: # %bb.1: ; X64-NEXT: movl (%rdx), %eax -; X64-NEXT: .LBB22_2: +; X64-NEXT: .LBB23_2: ; X64-NEXT: movl %r8d, (%rdi,%rcx) ; X64-NEXT: retq %rem = and i32 %position, 127 @@ -1046,10 +1232,10 @@ define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind ; X86-NEXT: movl %edi, (%edx) ; X86-NEXT: movl (%eax), %eax ; X86-NEXT: orl %ecx, %ebp -; X86-NEXT: jne .LBB23_2 +; X86-NEXT: jne .LBB24_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: addl %esi, %eax -; X86-NEXT: .LBB23_2: +; X86-NEXT: .LBB24_2: ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx 
diff --git a/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll b/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll index 4d41c8406f6e0..a42a715bdc6ab 100644 --- a/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll +++ b/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll @@ -7,8 +7,8 @@ define void @_ZNK4llvm5APInt21multiplicativeInverseERKS0_(ptr %r) { ; CHECK-LABEL: _ZNK4llvm5APInt21multiplicativeInverseERKS0_: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: jmp .LBB0_1 ; CHECK-NEXT: .p2align 4 @@ -68,8 +68,8 @@ _ZNK4llvm5APInt13getActiveBitsEv.exit.i.i: ; preds = %for.body.i.i.i.i.i define void @_ZNK4llvm5APInt21multiplicativeInverseERKS0__assert(ptr %r) { ; CHECK-LABEL: _ZNK4llvm5APInt21multiplicativeInverseERKS0__assert: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: jmp .LBB1_1 ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll index c1beb7c803b2b..c9c88f7258435 100644 --- a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll +++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll @@ -1031,31 +1031,30 @@ define void @simple_urem_fail_intermediate_inc(i32 %N, i32 %rem_amt) nounwind { ; CHECK-NEXT: testl %edi, %edi ; CHECK-NEXT: je .LBB17_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $1, %r15d +; CHECK-NEXT: movl $1, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB17_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB17_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB17_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1199,32 +1198,31 @@ define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_ ; CHECK-NEXT: cmpl $3, %edi ; CHECK-NEXT: jb .LBB21_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: orl $16, %ebx ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $7, %r15d +; CHECK-NEXT: movl $7, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB21_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $5, %eax -; CHECK-NEXT: movl %ecx, 
%r15d ; CHECK-NEXT: jne .LBB21_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB21_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1251,32 +1249,31 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw(i32 ; CHECK-NEXT: cmpl $3, %edi ; CHECK-NEXT: jb .LBB22_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: orl $16, %ebx ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $7, %r15d +; CHECK-NEXT: movl $7, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB22_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $5, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB22_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB22_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1303,31 +1300,30 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_no_simplify_rem( ; CHECK-NEXT: cmpl $3, %edi ; CHECK-NEXT: jb .LBB23_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: movl $7, %r15d +; CHECK-NEXT: movl $7, %ebp ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB23_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebp, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebp, %eax +; CHECK-NEXT: incl %ebp +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $5, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB23_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB23_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: @@ -1404,32 +1400,31 @@ define void @simple_urem_to_sel_non_zero_start_through_sub_no_simplfy(i32 %N, i3 ; CHECK-NEXT: cmpl %edx, %edi ; CHECK-NEXT: jbe .LBB25_4 ; CHECK-NEXT: # %bb.1: # %for.body.preheader -; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movl %edx, %r15d -; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %edx, %ebx +; CHECK-NEXT: movl %esi, %ebp ; CHECK-NEXT: movl %edi, %r14d ; CHECK-NEXT: negl %r14d -; CHECK-NEXT: addl $-2, %r15d +; CHECK-NEXT: addl $-2, %ebx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB25_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %r15d, %eax +; CHECK-NEXT: movl %ebx, %eax ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: divl %ebx +; CHECK-NEXT: divl %ebp ; CHECK-NEXT: movl %edx, %edi ; CHECK-NEXT: callq use.i32@PLT -; CHECK-NEXT: 
leal 1(%r14,%r15), %eax -; CHECK-NEXT: movl %r15d, %ecx -; CHECK-NEXT: incl %ecx +; CHECK-NEXT: movl %ebx, %eax +; CHECK-NEXT: incl %ebx +; CHECK-NEXT: leal 1(%r14,%rax), %eax ; CHECK-NEXT: cmpl $-2, %eax -; CHECK-NEXT: movl %ecx, %r15d ; CHECK-NEXT: jne .LBB25_2 ; CHECK-NEXT: # %bb.3: ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .LBB25_4: # %for.cond.cleanup ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll index e223765eb887b..46b2571e196bb 100644 --- a/llvm/test/CodeGen/X86/freeze-binary.ll +++ b/llvm/test/CodeGen/X86/freeze-binary.ll @@ -490,20 +490,21 @@ define i32 @freeze_ashr_exact(i32 %a0) nounwind { define i32 @freeze_ashr_exact_extra_use(i32 %a0, ptr %escape) nounwind { ; X86-LABEL: freeze_ashr_exact_extra_use: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sarl $3, %ecx -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sarl $3, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: sarl $6, %eax +; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: freeze_ashr_exact_extra_use: ; X64: # %bb.0: -; X64-NEXT: sarl $3, %edi -; X64-NEXT: movl %edi, (%rsi) ; X64-NEXT: movl %edi, %eax +; X64-NEXT: sarl $3, %eax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: sarl $6, %eax +; X64-NEXT: movl %ecx, (%rsi) ; X64-NEXT: retq %x = ashr exact i32 %a0, 3 %y = freeze i32 %x @@ -604,20 +605,21 @@ define i32 @freeze_lshr_exact(i32 %a0) nounwind { define i32 @freeze_lshr_exact_extra_use(i32 %a0, ptr %escape) nounwind { ; X86-LABEL: freeze_lshr_exact_extra_use: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shrl $3, %ecx -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shrl $3, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: shrl $5, %eax +; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: freeze_lshr_exact_extra_use: ; X64: # %bb.0: -; X64-NEXT: shrl $3, %edi -; X64-NEXT: movl %edi, (%rsi) ; X64-NEXT: movl %edi, %eax +; X64-NEXT: shrl $3, %eax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: shrl $5, %eax +; X64-NEXT: movl %ecx, (%rsi) ; X64-NEXT: retq %x = lshr exact i32 %a0, 3 %y = freeze i32 %x diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll index cffd88c55bb0a..477a0dce5c81c 100644 --- a/llvm/test/CodeGen/X86/i128-mul.ll +++ b/llvm/test/CodeGen/X86/i128-mul.ll @@ -111,62 +111,63 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-NOBMI-NEXT: orl %ecx, %eax ; X86-NOBMI-NEXT: je .LBB1_3 ; X86-NOBMI-NEXT: # %bb.1: # %for.body.preheader -; X86-NOBMI-NEXT: xorl %eax, %eax -; X86-NOBMI-NEXT: xorl %edx, %edx +; X86-NOBMI-NEXT: xorl %esi, %esi ; X86-NOBMI-NEXT: xorl %ecx, %ecx -; X86-NOBMI-NEXT: movl $0, (%esp) # 4-byte Folded Spill +; X86-NOBMI-NEXT: xorl %edi, %edi +; X86-NOBMI-NEXT: xorl %ebp, %ebp ; X86-NOBMI-NEXT: .p2align 4 ; X86-NOBMI-NEXT: .LBB1_2: # %for.body ; X86-NOBMI-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOBMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOBMI-NEXT: movl (%eax,%ecx,8), %edi -; X86-NOBMI-NEXT: movl 4(%eax,%ecx,8), 
%ebx +; X86-NOBMI-NEXT: movl (%eax,%edi,8), %ebp +; X86-NOBMI-NEXT: movl 4(%eax,%edi,8), %ebx ; X86-NOBMI-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NOBMI-NEXT: movl %edi, %eax -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: mull %esi -; X86-NOBMI-NEXT: movl %edx, %ebp +; X86-NOBMI-NEXT: movl %ebp, %eax +; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOBMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NOBMI-NEXT: movl %ebx, %eax -; X86-NOBMI-NEXT: mull %esi -; X86-NOBMI-NEXT: movl %edx, %ebx -; X86-NOBMI-NEXT: movl %eax, %esi -; X86-NOBMI-NEXT: addl %ebp, %esi -; X86-NOBMI-NEXT: adcl $0, %ebx -; X86-NOBMI-NEXT: movl %edi, %eax +; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) +; X86-NOBMI-NEXT: movl %eax, %ebx +; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NOBMI-NEXT: adcl $0, %edx +; X86-NOBMI-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NOBMI-NEXT: movl %ebp, %eax ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: mull %edx -; X86-NOBMI-NEXT: movl %edx, %ebp -; X86-NOBMI-NEXT: movl %eax, %edi -; X86-NOBMI-NEXT: addl %esi, %edi -; X86-NOBMI-NEXT: adcl %ebx, %ebp -; X86-NOBMI-NEXT: setb %bl +; X86-NOBMI-NEXT: movl %eax, %ebp +; X86-NOBMI-NEXT: addl %ebx, %ebp +; X86-NOBMI-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NOBMI-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X86-NOBMI-NEXT: movl %edx, %ebx +; X86-NOBMI-NEXT: setb (%esp) # 1-byte Folded Spill ; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NOBMI-NEXT: mull {{[0-9]+}}(%esp) -; X86-NOBMI-NEXT: addl %ebp, %eax -; X86-NOBMI-NEXT: movzbl %bl, %esi -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NOBMI-NEXT: adcl %esi, %edx -; X86-NOBMI-NEXT: movl %ecx, %ebx -; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NOBMI-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NOBMI-NEXT: adcl $0, %eax -; X86-NOBMI-NEXT: adcl $0, %edx -; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl %ecx, (%esi,%ebx,8) -; X86-NOBMI-NEXT: movl %ebx, %ecx -; X86-NOBMI-NEXT: movl %edi, 4(%esi,%ebx,8) -; X86-NOBMI-NEXT: addl $1, %ecx -; X86-NOBMI-NEXT: movl (%esp), %edi # 4-byte Reload -; X86-NOBMI-NEXT: adcl $0, %edi -; X86-NOBMI-NEXT: movl %ecx, %esi -; X86-NOBMI-NEXT: xorl {{[0-9]+}}(%esp), %esi -; X86-NOBMI-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NOBMI-NEXT: xorl %ebp, %edi -; X86-NOBMI-NEXT: orl %esi, %edi +; X86-NOBMI-NEXT: movl %eax, %esi +; X86-NOBMI-NEXT: addl %ebx, %esi +; X86-NOBMI-NEXT: movl %ecx, %eax +; X86-NOBMI-NEXT: movzbl (%esp), %ebx # 1-byte Folded Reload +; X86-NOBMI-NEXT: movl %edx, %ecx +; X86-NOBMI-NEXT: adcl %ebx, %ecx +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NOBMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NOBMI-NEXT: adcl %eax, %ebp +; X86-NOBMI-NEXT: adcl $0, %esi +; X86-NOBMI-NEXT: adcl $0, %ecx +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl %edx, (%eax,%edi,8) +; X86-NOBMI-NEXT: movl %ebp, 4(%eax,%edi,8) +; X86-NOBMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NOBMI-NEXT: addl $1, %edi +; X86-NOBMI-NEXT: adcl $0, %ebp +; X86-NOBMI-NEXT: movl %edi, %eax +; 
X86-NOBMI-NEXT: xorl %edx, %eax +; X86-NOBMI-NEXT: movl %ebp, %edx +; X86-NOBMI-NEXT: xorl %ebx, %edx +; X86-NOBMI-NEXT: orl %eax, %edx ; X86-NOBMI-NEXT: jne .LBB1_2 ; X86-NOBMI-NEXT: .LBB1_3: # %for.end ; X86-NOBMI-NEXT: xorl %eax, %eax @@ -184,71 +185,66 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X86-BMI-NEXT: pushl %ebx ; X86-BMI-NEXT: pushl %edi ; X86-BMI-NEXT: pushl %esi -; X86-BMI-NEXT: subl $20, %esp +; X86-BMI-NEXT: subl $16, %esp ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI-NEXT: orl %ecx, %eax ; X86-BMI-NEXT: je .LBB1_3 ; X86-BMI-NEXT: # %bb.1: # %for.body.preheader -; X86-BMI-NEXT: xorl %ecx, %ecx -; X86-BMI-NEXT: xorl %eax, %eax +; X86-BMI-NEXT: xorl %esi, %esi +; X86-BMI-NEXT: xorl %edi, %edi ; X86-BMI-NEXT: xorl %ebx, %ebx -; X86-BMI-NEXT: xorl %ebp, %ebp +; X86-BMI-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-BMI-NEXT: .p2align 4 ; X86-BMI-NEXT: .LBB1_2: # %for.body ; X86-BMI-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-BMI-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-BMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI-NEXT: movl (%eax,%ebx,8), %ecx -; X86-BMI-NEXT: movl 4(%eax,%ebx,8), %esi -; X86-BMI-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-BMI-NEXT: movl 4(%eax,%ebx,8), %ebp +; X86-BMI-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-BMI-NEXT: movl %ecx, %edx -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI-NEXT: mulxl %eax, %edx, %edi +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %edx, %eax +; X86-BMI-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-BMI-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-BMI-NEXT: movl %esi, %edx -; X86-BMI-NEXT: mulxl %eax, %esi, %eax -; X86-BMI-NEXT: addl %edi, %esi -; X86-BMI-NEXT: adcl $0, %eax +; X86-BMI-NEXT: movl %ebp, %edx +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %eax, %ebp +; X86-BMI-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X86-BMI-NEXT: adcl $0, %ebp ; X86-BMI-NEXT: movl %ecx, %edx -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-BMI-NEXT: mulxl %ecx, %edi, %ebp -; X86-BMI-NEXT: addl %esi, %edi -; X86-BMI-NEXT: adcl %eax, %ebp +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %ecx, %edx +; X86-BMI-NEXT: addl %eax, %ecx +; X86-BMI-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-BMI-NEXT: movl %esi, %eax +; X86-BMI-NEXT: adcl %ebp, %edx +; X86-BMI-NEXT: movl %edx, %ebp ; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-BMI-NEXT: mulxl %ecx, %ecx, %eax +; X86-BMI-NEXT: mulxl {{[0-9]+}}(%esp), %esi, %edi ; X86-BMI-NEXT: setb %dl -; X86-BMI-NEXT: addl %ebp, %ecx -; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-BMI-NEXT: addl %ebp, %esi ; X86-BMI-NEXT: movzbl %dl, %edx -; X86-BMI-NEXT: adcl %edx, %eax -; X86-BMI-NEXT: movl %eax, %edx -; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-BMI-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-BMI-NEXT: adcl (%esp), %edi # 4-byte Folded Reload -; X86-BMI-NEXT: adcl $0, %ecx -; X86-BMI-NEXT: adcl $0, %edx -; X86-BMI-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-BMI-NEXT: movl %eax, (%edx,%ebx,8) -; X86-BMI-NEXT: movl %edi, 4(%edx,%ebx,8) -; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-BMI-NEXT: adcl %edx, 
%edi +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-BMI-NEXT: addl %eax, %edx +; X86-BMI-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload +; X86-BMI-NEXT: adcl $0, %esi +; X86-BMI-NEXT: adcl $0, %edi +; X86-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI-NEXT: movl %edx, (%eax,%ebx,8) +; X86-BMI-NEXT: movl %ecx, 4(%eax,%ebx,8) ; X86-BMI-NEXT: addl $1, %ebx -; X86-BMI-NEXT: adcl $0, %ebp -; X86-BMI-NEXT: movl %ebx, %edx -; X86-BMI-NEXT: xorl %esi, %edx -; X86-BMI-NEXT: movl %ebp, %esi -; X86-BMI-NEXT: xorl %edi, %esi -; X86-BMI-NEXT: orl %edx, %esi -; X86-BMI-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-BMI-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-BMI-NEXT: adcl $0, %ecx +; X86-BMI-NEXT: movl %ebx, %eax +; X86-BMI-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-BMI-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-BMI-NEXT: xorl %ebp, %ecx +; X86-BMI-NEXT: orl %eax, %ecx ; X86-BMI-NEXT: jne .LBB1_2 ; X86-BMI-NEXT: .LBB1_3: # %for.end ; X86-BMI-NEXT: xorl %eax, %eax ; X86-BMI-NEXT: xorl %edx, %edx -; X86-BMI-NEXT: addl $20, %esp +; X86-BMI-NEXT: addl $16, %esp ; X86-BMI-NEXT: popl %esi ; X86-BMI-NEXT: popl %edi ; X86-BMI-NEXT: popl %ebx @@ -261,11 +257,12 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-NOBMI-NEXT: je .LBB1_3 ; X64-NOBMI-NEXT: # %bb.1: # %for.body.preheader ; X64-NOBMI-NEXT: movq %rdx, %r8 -; X64-NOBMI-NEXT: xorl %r10d, %r10d +; X64-NOBMI-NEXT: xorl %edx, %edx ; X64-NOBMI-NEXT: xorl %r9d, %r9d ; X64-NOBMI-NEXT: .p2align 4 ; X64-NOBMI-NEXT: .LBB1_2: # %for.body ; X64-NOBMI-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-NOBMI-NEXT: movq %rdx, %r10 ; X64-NOBMI-NEXT: movq %rcx, %rax ; X64-NOBMI-NEXT: mulq (%r8,%r9,8) ; X64-NOBMI-NEXT: addq %r10, %rax @@ -273,7 +270,6 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-NOBMI-NEXT: movq %rax, (%rsi,%r9,8) ; X64-NOBMI-NEXT: incq %r9 ; X64-NOBMI-NEXT: cmpq %r9, %rdi -; X64-NOBMI-NEXT: movq %rdx, %r10 ; X64-NOBMI-NEXT: jne .LBB1_2 ; X64-NOBMI-NEXT: .LBB1_3: # %for.end ; X64-NOBMI-NEXT: xorl %eax, %eax @@ -285,11 +281,12 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-BMI-NEXT: je .LBB1_3 ; X64-BMI-NEXT: # %bb.1: # %for.body.preheader ; X64-BMI-NEXT: movq %rdx, %rax -; X64-BMI-NEXT: xorl %r9d, %r9d +; X64-BMI-NEXT: xorl %edx, %edx ; X64-BMI-NEXT: xorl %r8d, %r8d ; X64-BMI-NEXT: .p2align 4 ; X64-BMI-NEXT: .LBB1_2: # %for.body ; X64-BMI-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-BMI-NEXT: movq %rdx, %r9 ; X64-BMI-NEXT: movq %rcx, %rdx ; X64-BMI-NEXT: mulxq (%rax,%r8,8), %r10, %rdx ; X64-BMI-NEXT: addq %r9, %r10 @@ -297,7 +294,6 @@ define i64 @mul1(i64 %n, ptr nocapture %z, ptr nocapture %x, i64 %y) nounwind { ; X64-BMI-NEXT: movq %r10, (%rsi,%r8,8) ; X64-BMI-NEXT: incq %r8 ; X64-BMI-NEXT: cmpq %r8, %rdi -; X64-BMI-NEXT: movq %rdx, %r9 ; X64-BMI-NEXT: jne .LBB1_2 ; X64-BMI-NEXT: .LBB1_3: # %for.end ; X64-BMI-NEXT: xorl %eax, %eax diff --git a/llvm/test/CodeGen/X86/icmp-abs-C.ll b/llvm/test/CodeGen/X86/icmp-abs-C.ll index 53b70fa38958b..c98889b7d5cb3 100644 --- a/llvm/test/CodeGen/X86/icmp-abs-C.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C.ll @@ -161,22 +161,22 @@ define i16 @ne_and_with_dom_abs(i16 %x) nounwind { ; X86-LABEL: ne_and_with_dom_abs: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movswl %cx, %eax -; X86-NEXT: sarl $15, %eax -; X86-NEXT: xorl %eax, %ecx 
-; X86-NEXT: subl %eax, %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movswl %ax, %ecx +; X86-NEXT: sarl $15, %ecx +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, %edx ; X86-NEXT: xorl $12312, %eax # imm = 0x3018 ; X86-NEXT: movzwl %ax, %esi -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpw $64, %cx -; X86-NEXT: setne %cl +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: cmpw $64, %dx +; X86-NEXT: setne %dl ; X86-NEXT: cmpl $2345, %esi # imm = 0x929 ; X86-NEXT: jae .LBB3_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movb %cl, %dl -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movb %dl, %cl +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: .LBB3_2: ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index caec02eaa19c7..2f691e7ca8f5b 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -255,9 +255,9 @@ define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) { ; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X64-KNL-NEXT: kmovw %k1, %k2 ; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2} -; X64-KNL-NEXT: vmovdqa64 %zmm1, %zmm2 -; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} -; X64-KNL-NEXT: vpaddd %ymm2, %ymm1, %ymm0 +; X64-KNL-NEXT: vmovdqa %ymm1, %ymm2 +; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} +; X64-KNL-NEXT: vpaddd %ymm1, %ymm2, %ymm0 ; X64-KNL-NEXT: retq ; ; X86-KNL-LABEL: test7: @@ -271,9 +271,9 @@ define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) { ; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X86-KNL-NEXT: kmovw %k1, %k2 ; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2} -; X86-KNL-NEXT: vmovdqa64 %zmm1, %zmm2 -; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1} -; X86-KNL-NEXT: vpaddd %ymm2, %ymm1, %ymm0 +; X86-KNL-NEXT: vmovdqa %ymm1, %ymm2 +; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} +; X86-KNL-NEXT: vpaddd %ymm1, %ymm2, %ymm0 ; X86-KNL-NEXT: retl ; ; X64-SKX-LABEL: test7: diff --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll index a75d42ed0c50f..c058e37e0ce11 100644 --- a/llvm/test/CodeGen/X86/midpoint-int.ll +++ b/llvm/test/CodeGen/X86/midpoint-int.ll @@ -658,9 +658,9 @@ define i16 @scalar_i16_signed_reg_reg(i16 %a1, i16 %a2) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -710,9 +710,9 @@ define i16 @scalar_i16_unsigned_reg_reg(i16 %a1, i16 %a2) nounwind { ; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setbe %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -765,9 +765,9 @@ define i16 @scalar_i16_signed_mem_reg(ptr %a1_addr, i16 %a2) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %ecx +; X86-NEXT: movzwl (%eax), %eax ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; 
X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -817,11 +817,11 @@ define i16 @scalar_i16_signed_reg_mem(i16 %a1, ptr %a2_addr) nounwind { ; X86-LABEL: scalar_i16_signed_reg_mem: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx @@ -871,12 +871,12 @@ define i16 @scalar_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind { ; X86-LABEL: scalar_i16_signed_mem_mem: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzwl (%ecx), %ecx -; X86-NEXT: movzwl (%eax), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: movzwl (%ecx), %edx ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: subw %dx, %ax ; X86-NEXT: setle %bl ; X86-NEXT: leal -1(%ebx,%ebx), %edx diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll index 73d459ba77026..8f97d2652bc53 100644 --- a/llvm/test/CodeGen/X86/mmx-arith.ll +++ b/llvm/test/CodeGen/X86/mmx-arith.ll @@ -403,11 +403,11 @@ define <1 x i64> @test3(ptr %a, ptr %b, i32 %count) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: je .LBB3_1 ; X86-NEXT: # %bb.2: # %bb26.preheader ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: xorl %eax, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB3_3: # %bb26 @@ -427,7 +427,6 @@ define <1 x i64> @test3(ptr %a, ptr %b, i32 %count) nounwind { ; X86-NEXT: jb .LBB3_3 ; X86-NEXT: jmp .LBB3_4 ; X86-NEXT: .LBB3_1: -; X86-NEXT: xorl %eax, %eax ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .LBB3_4: # %bb31 ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/mul-constant-i16.ll b/llvm/test/CodeGen/X86/mul-constant-i16.ll index b1aa789e53cd7..a663f6a1dd376 100644 --- a/llvm/test/CodeGen/X86/mul-constant-i16.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i16.ll @@ -715,8 +715,8 @@ define i16 @test_mul_by_66(i16 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $6, %eax -; X64-NEXT: leal (%rax,%rdi,2), %eax +; X64-NEXT: shll $6, %edi +; X64-NEXT: leal (%rdi,%rax,2), %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %mul = mul nsw i16 %x, 66 @@ -757,8 +757,8 @@ define i16 @test_mul_by_520(i16 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $9, %eax -; X64-NEXT: leal (%rax,%rdi,8), %eax +; X64-NEXT: shll $9, %edi +; X64-NEXT: leal (%rdi,%rax,8), %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %mul = mul nsw i16 %x, 520 diff --git a/llvm/test/CodeGen/X86/mul-constant-i32.ll b/llvm/test/CodeGen/X86/mul-constant-i32.ll index 79889b9ace406..4129b44ed3ddc 100644 --- a/llvm/test/CodeGen/X86/mul-constant-i32.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i32.ll @@ -1155,16 +1155,16 @@ define i32 @test_mul_by_66(i32 %x) { ; X64-HSW: # %bb.0: ; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi ; X64-HSW-NEXT: movl %edi, %eax -; X64-HSW-NEXT: shll $6, 
%eax -; X64-HSW-NEXT: leal (%rax,%rdi,2), %eax +; X64-HSW-NEXT: shll $6, %edi +; X64-HSW-NEXT: leal (%rdi,%rax,2), %eax ; X64-HSW-NEXT: retq ; ; X64-JAG-LABEL: test_mul_by_66: ; X64-JAG: # %bb.0: ; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi ; X64-JAG-NEXT: movl %edi, %eax -; X64-JAG-NEXT: shll $6, %eax -; X64-JAG-NEXT: leal (%rax,%rdi,2), %eax +; X64-JAG-NEXT: shll $6, %edi +; X64-JAG-NEXT: leal (%rdi,%rax,2), %eax ; X64-JAG-NEXT: retq ; ; X86-NOOPT-LABEL: test_mul_by_66: @@ -1241,16 +1241,16 @@ define i32 @test_mul_by_520(i32 %x) { ; X64-HSW: # %bb.0: ; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi ; X64-HSW-NEXT: movl %edi, %eax -; X64-HSW-NEXT: shll $9, %eax -; X64-HSW-NEXT: leal (%rax,%rdi,8), %eax +; X64-HSW-NEXT: shll $9, %edi +; X64-HSW-NEXT: leal (%rdi,%rax,8), %eax ; X64-HSW-NEXT: retq ; ; X64-JAG-LABEL: test_mul_by_520: ; X64-JAG: # %bb.0: ; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi ; X64-JAG-NEXT: movl %edi, %eax -; X64-JAG-NEXT: shll $9, %eax -; X64-JAG-NEXT: leal (%rax,%rdi,8), %eax +; X64-JAG-NEXT: shll $9, %edi +; X64-JAG-NEXT: leal (%rdi,%rax,8), %eax ; X64-JAG-NEXT: retq ; ; X86-NOOPT-LABEL: test_mul_by_520: diff --git a/llvm/test/CodeGen/X86/mul-constant-i8.ll b/llvm/test/CodeGen/X86/mul-constant-i8.ll index a4fa1ee8c0029..b488653655728 100644 --- a/llvm/test/CodeGen/X86/mul-constant-i8.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i8.ll @@ -425,8 +425,8 @@ define i8 @test_mul_by_66(i8 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $6, %eax -; X64-NEXT: leal (%rax,%rdi,2), %eax +; X64-NEXT: shll $6, %edi +; X64-NEXT: leal (%rdi,%rax,2), %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 66 diff --git a/llvm/test/CodeGen/X86/optimize-max-0.ll b/llvm/test/CodeGen/X86/optimize-max-0.ll index 283c00e17f21a..b6af7e1641a9c 100644 --- a/llvm/test/CodeGen/X86/optimize-max-0.ll +++ b/llvm/test/CodeGen/X86/optimize-max-0.ll @@ -16,65 +16,65 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: subl $28, %esp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: imull %ebp, %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: imull %esi, %eax ; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl %ecx, (%esp) ## 4-byte Spill +; CHECK-NEXT: movl %eax, (%esp) ## 4-byte Spill ; CHECK-NEXT: je LBB0_19 ; CHECK-NEXT: ## %bb.1: ## %bb10.preheader -; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: sarl $31, %eax -; CHECK-NEXT: shrl $30, %eax -; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: sarl $2, %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: movl %eax, %ebp +; CHECK-NEXT: sarl $31, %ebp +; CHECK-NEXT: shrl $30, %ebp +; CHECK-NEXT: addl %eax, %ebp +; CHECK-NEXT: sarl $2, %ebp +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: jle LBB0_12 ; CHECK-NEXT: ## %bb.2: ## %bb.nph9 -; CHECK-NEXT: testl %ebp, %ebp +; CHECK-NEXT: testl %esi, %esi ; CHECK-NEXT: jle LBB0_12 ; CHECK-NEXT: ## %bb.3: ## %bb.nph9.split ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: incl %eax ; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: movl %edi, %edx +; CHECK-NEXT: xorl %edi, %edi ; 
CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_4: ## %bb6 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movzbl (%eax,%esi,2), %ebx -; CHECK-NEXT: movb %bl, (%edx,%esi) -; CHECK-NEXT: incl %esi -; CHECK-NEXT: cmpl %ebp, %esi +; CHECK-NEXT: movzbl (%eax,%edi,2), %ebx +; CHECK-NEXT: movb %bl, (%edx,%edi) +; CHECK-NEXT: incl %edi +; CHECK-NEXT: cmpl %esi, %edi ; CHECK-NEXT: jl LBB0_4 ; CHECK-NEXT: ## %bb.5: ## %bb9 ; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1 ; CHECK-NEXT: incl %ecx ; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl %ebp, %edx -; CHECK-NEXT: cmpl %edi, %ecx +; CHECK-NEXT: addl %esi, %edx +; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: je LBB0_12 ; CHECK-NEXT: ## %bb.6: ## %bb7.preheader ; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1 -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: jmp LBB0_4 ; CHECK-NEXT: LBB0_12: ## %bb18.loopexit +; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: movl (%esp), %eax ## 4-byte Reload -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: addl %ebp, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: cmpl $1, %edi +; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp) ; CHECK-NEXT: jle LBB0_13 ; CHECK-NEXT: ## %bb.7: ## %bb.nph5 -; CHECK-NEXT: cmpl $2, %ebp +; CHECK-NEXT: cmpl $2, %esi ; CHECK-NEXT: jl LBB0_13 ; CHECK-NEXT: ## %bb.8: ## %bb.nph5.split -; CHECK-NEXT: movl %ebp, %edx -; CHECK-NEXT: shrl $31, %edx -; CHECK-NEXT: addl %ebp, %edx -; CHECK-NEXT: sarl %edx +; CHECK-NEXT: movl %esi, %ebp +; CHECK-NEXT: shrl $31, %ebp +; CHECK-NEXT: addl %esi, %ebp +; CHECK-NEXT: sarl %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $31, %ecx @@ -84,102 +84,103 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-NEXT: addl $2, %esi -; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload -; CHECK-NEXT: addl %esi, %ecx -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl $2, %edx +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload +; CHECK-NEXT: addl %edx, %ecx ; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_9: ## %bb13 ; CHECK-NEXT: ## =>This Loop Header: Depth=1 ; CHECK-NEXT: ## Child Loop BB0_10 Depth 2 ; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: andl $1, %edi -; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: addl %esi, %edi +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: addl %edx, %edi ; CHECK-NEXT: imull {{[0-9]+}}(%esp), %edi ; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: xorl %ebx, %ebx ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_10: ## %bb14 ; CHECK-NEXT: ## Parent Loop BB0_9 Depth=1 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 -; CHECK-NEXT: movzbl -2(%edi,%esi,4), %ebx -; CHECK-NEXT: movb %bl, (%ecx,%esi) -; CHECK-NEXT: movzbl (%edi,%esi,4), %ebx -; CHECK-NEXT: movb %bl, (%eax,%esi) -; CHECK-NEXT: incl %esi -; 
CHECK-NEXT: cmpl %edx, %esi +; CHECK-NEXT: movzbl -2(%edi,%ebx,4), %edx +; CHECK-NEXT: movb %dl, (%ecx,%ebx) +; CHECK-NEXT: movzbl (%edi,%ebx,4), %edx +; CHECK-NEXT: movb %dl, (%eax,%ebx) +; CHECK-NEXT: incl %ebx +; CHECK-NEXT: cmpl %ebp, %ebx ; CHECK-NEXT: jl LBB0_10 ; CHECK-NEXT: ## %bb.11: ## %bb17 ; CHECK-NEXT: ## in Loop: Header=BB0_9 Depth=1 ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload ; CHECK-NEXT: incl %edi -; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; CHECK-NEXT: addl $2, %esi -; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: addl %ebp, %eax +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; CHECK-NEXT: addl $2, %edx +; CHECK-NEXT: addl %ebp, %ecx ; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; CHECK-NEXT: jl LBB0_9 ; CHECK-NEXT: LBB0_13: ## %bb20 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: cmpl $1, %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: cmpl $1, %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx ; CHECK-NEXT: je LBB0_19 ; CHECK-NEXT: ## %bb.14: ## %bb20 -; CHECK-NEXT: cmpl $3, %eax +; CHECK-NEXT: cmpl $3, %ecx ; CHECK-NEXT: jne LBB0_24 ; CHECK-NEXT: ## %bb.15: ## %bb22 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; CHECK-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; CHECK-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: jle LBB0_18 ; CHECK-NEXT: ## %bb.16: ## %bb.nph -; CHECK-NEXT: leal 15(%edi), %eax +; CHECK-NEXT: leal 15(%edx), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: imull {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: addl %ebx, %ebx -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload -; CHECK-NEXT: addl %esi, %ecx -; CHECK-NEXT: addl %ecx, %ebx -; CHECK-NEXT: addl %eax, %edx -; CHECK-NEXT: leal 15(%ebp), %eax +; CHECK-NEXT: addl %ebp, %ebp +; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: addl %edi, %ecx +; CHECK-NEXT: addl %ecx, %ebp +; CHECK-NEXT: addl %eax, %ebx +; CHECK-NEXT: leal 15(%esi), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_17: ## %bb23 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: subl $4, %esp -; CHECK-NEXT: pushl %ebp -; CHECK-NEXT: pushl %edx +; CHECK-NEXT: pushl %esi ; CHECK-NEXT: pushl %ebx -; CHECK-NEXT: movl %ebx, %esi +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %ebp, %edi +; CHECK-NEXT: movl %ebx, %ebp ; CHECK-NEXT: movl %edx, %ebx ; CHECK-NEXT: calll _memcpy ; CHECK-NEXT: movl %ebx, %edx -; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %ebp, %ebx +; CHECK-NEXT: movl %edi, %ebp ; CHECK-NEXT: addl $16, %esp -; CHECK-NEXT: addl %ebp, %ebx -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; CHECK-NEXT: decl %edi +; CHECK-NEXT: addl %esi, %ebp +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: decl %edx ; CHECK-NEXT: jne LBB0_17 ; CHECK-NEXT: LBB0_18: ## %bb26 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload -; CHECK-NEXT: addl %edx, %eax -; CHECK-NEXT: movl 
{{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; CHECK-NEXT: addl %ecx, %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl %esi, %edx ; CHECK-NEXT: jmp LBB0_23 ; CHECK-NEXT: LBB0_19: ## %bb29 -; CHECK-NEXT: testl %edi, %edi +; CHECK-NEXT: testl %edx, %edx ; CHECK-NEXT: jle LBB0_22 ; CHECK-NEXT: ## %bb.20: ## %bb.nph11 -; CHECK-NEXT: movl %edi, %esi -; CHECK-NEXT: leal 15(%ebp), %eax +; CHECK-NEXT: leal 15(%esi), %eax ; CHECK-NEXT: andl $-16, %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -187,30 +188,32 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: LBB0_21: ## %bb30 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: subl $4, %esp -; CHECK-NEXT: pushl %ebp -; CHECK-NEXT: pushl %edx +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: pushl %edi +; CHECK-NEXT: movl %ebx, %ebp ; CHECK-NEXT: movl %edx, %ebx ; CHECK-NEXT: calll _memcpy ; CHECK-NEXT: movl %ebx, %edx +; CHECK-NEXT: movl %ebp, %ebx ; CHECK-NEXT: addl $16, %esp -; CHECK-NEXT: addl %ebp, %edi -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; CHECK-NEXT: decl %esi +; CHECK-NEXT: addl %esi, %edi +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: decl %edx ; CHECK-NEXT: jne LBB0_21 ; CHECK-NEXT: LBB0_22: ## %bb33 -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload -; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl %ecx, %edx ; CHECK-NEXT: LBB0_23: ## %bb33 -; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: shrl $31, %eax -; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: sarl %eax ; CHECK-NEXT: subl $4, %esp ; CHECK-NEXT: pushl %eax ; CHECK-NEXT: pushl $128 -; CHECK-NEXT: pushl %ecx +; CHECK-NEXT: pushl %edx ; CHECK-NEXT: calll _memset ; CHECK-NEXT: addl $44, %esp ; CHECK-NEXT: LBB0_25: ## %return @@ -523,38 +526,38 @@ define void @bar(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind { ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB1_9: ## %bb13 ; CHECK-NEXT: ## =>This Loop Header: Depth=1 ; CHECK-NEXT: ## Child Loop BB1_10 Depth 2 -; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; CHECK-NEXT: andl $1, %ebx ; CHECK-NEXT: movl %edx, (%esp) ## 4-byte Spill -; CHECK-NEXT: addl %edx, %ebx -; CHECK-NEXT: imull {{[0-9]+}}(%esp), %ebx -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: addl %esi, %edx +; CHECK-NEXT: imull {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; CHECK-NEXT: xorl %esi, %esi ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB1_10: ## %bb14 ; CHECK-NEXT: ## Parent Loop BB1_9 Depth=1 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2 -; CHECK-NEXT: movzbl -2(%ebx,%esi,4), %edx -; CHECK-NEXT: movb %dl, (%eax,%esi) -; CHECK-NEXT: movzbl (%ebx,%esi,4), %edx -; CHECK-NEXT: movb %dl, (%ecx,%esi) +; CHECK-NEXT: movzbl 
-2(%edx,%esi,4), %ebx +; CHECK-NEXT: movb %bl, (%eax,%esi) +; CHECK-NEXT: movzbl (%edx,%esi,4), %ebx +; CHECK-NEXT: movb %bl, (%ecx,%esi) ; CHECK-NEXT: incl %esi ; CHECK-NEXT: cmpl %ebp, %esi ; CHECK-NEXT: jb LBB1_10 ; CHECK-NEXT: ## %bb.11: ## %bb17 ; CHECK-NEXT: ## in Loop: Header=BB1_9 Depth=1 -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; CHECK-NEXT: incl %ebx -; CHECK-NEXT: addl %ebp, %ecx ; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload -; CHECK-NEXT: addl $2, %edx +; CHECK-NEXT: incl %edx +; CHECK-NEXT: addl %ebp, %ecx +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; CHECK-NEXT: addl $2, %esi ; CHECK-NEXT: addl %ebp, %eax -; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; CHECK-NEXT: jb LBB1_9 ; CHECK-NEXT: LBB1_13: ## %bb20 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi diff --git a/llvm/test/CodeGen/X86/parity.ll b/llvm/test/CodeGen/X86/parity.ll index 420f5ba5ab433..31a7f1125150b 100644 --- a/llvm/test/CodeGen/X86/parity.ll +++ b/llvm/test/CodeGen/X86/parity.ll @@ -219,12 +219,12 @@ define i64 @parity_64(i64 %x) { ; ; X64-NOPOPCNT-LABEL: parity_64: ; X64-NOPOPCNT: # %bb.0: -; X64-NOPOPCNT-NEXT: movq %rdi, %rax -; X64-NOPOPCNT-NEXT: shrq $32, %rax -; X64-NOPOPCNT-NEXT: xorl %edi, %eax -; X64-NOPOPCNT-NEXT: movl %eax, %ecx +; X64-NOPOPCNT-NEXT: movl %edi, %eax +; X64-NOPOPCNT-NEXT: shrq $32, %rdi +; X64-NOPOPCNT-NEXT: xorl %eax, %edi +; X64-NOPOPCNT-NEXT: movl %edi, %ecx ; X64-NOPOPCNT-NEXT: shrl $16, %ecx -; X64-NOPOPCNT-NEXT: xorl %eax, %ecx +; X64-NOPOPCNT-NEXT: xorl %edi, %ecx ; X64-NOPOPCNT-NEXT: xorl %eax, %eax ; X64-NOPOPCNT-NEXT: xorb %ch, %cl ; X64-NOPOPCNT-NEXT: setnp %al @@ -264,12 +264,12 @@ define i32 @parity_64_trunc(i64 %x) { ; ; X64-NOPOPCNT-LABEL: parity_64_trunc: ; X64-NOPOPCNT: # %bb.0: -; X64-NOPOPCNT-NEXT: movq %rdi, %rax -; X64-NOPOPCNT-NEXT: shrq $32, %rax -; X64-NOPOPCNT-NEXT: xorl %edi, %eax -; X64-NOPOPCNT-NEXT: movl %eax, %ecx +; X64-NOPOPCNT-NEXT: movl %edi, %eax +; X64-NOPOPCNT-NEXT: shrq $32, %rdi +; X64-NOPOPCNT-NEXT: xorl %eax, %edi +; X64-NOPOPCNT-NEXT: movl %edi, %ecx ; X64-NOPOPCNT-NEXT: shrl $16, %ecx -; X64-NOPOPCNT-NEXT: xorl %eax, %ecx +; X64-NOPOPCNT-NEXT: xorl %edi, %ecx ; X64-NOPOPCNT-NEXT: xorl %eax, %eax ; X64-NOPOPCNT-NEXT: xorb %ch, %cl ; X64-NOPOPCNT-NEXT: setnp %al @@ -628,12 +628,12 @@ define i64 @parity_64_shift(i64 %0) { ; ; X64-NOPOPCNT-LABEL: parity_64_shift: ; X64-NOPOPCNT: # %bb.0: -; X64-NOPOPCNT-NEXT: movq %rdi, %rax -; X64-NOPOPCNT-NEXT: shrq $32, %rax -; X64-NOPOPCNT-NEXT: xorl %edi, %eax -; X64-NOPOPCNT-NEXT: movl %eax, %ecx +; X64-NOPOPCNT-NEXT: movl %edi, %eax +; X64-NOPOPCNT-NEXT: shrq $32, %rdi +; X64-NOPOPCNT-NEXT: xorl %eax, %edi +; X64-NOPOPCNT-NEXT: movl %edi, %ecx ; X64-NOPOPCNT-NEXT: shrl $16, %ecx -; X64-NOPOPCNT-NEXT: xorl %eax, %ecx +; X64-NOPOPCNT-NEXT: xorl %edi, %ecx ; X64-NOPOPCNT-NEXT: xorl %eax, %eax ; X64-NOPOPCNT-NEXT: xorb %ch, %cl ; X64-NOPOPCNT-NEXT: setnp %al diff --git a/llvm/test/CodeGen/X86/pr166744.ll b/llvm/test/CodeGen/X86/pr166744.ll index 21b25d87796a5..ffdb68c7a6c01 100644 --- a/llvm/test/CodeGen/X86/pr166744.ll +++ b/llvm/test/CodeGen/X86/pr166744.ll @@ -31,13 +31,13 @@ define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) { ; NOPOSTRA-LABEL: PR166744: ; NOPOSTRA: # %bb.0: ; NOPOSTRA-NEXT: movl %esi, %eax -; NOPOSTRA-NEXT: shrl $3, %eax -; NOPOSTRA-NEXT: andl $60, %eax -; NOPOSTRA-NEXT: movl (%rdi,%rax), %ecx -; NOPOSTRA-NEXT: btrl %esi, %ecx -; 
NOPOSTRA-NEXT: shlxl %esi, %edx, %edx -; NOPOSTRA-NEXT: orl %ecx, %edx -; NOPOSTRA-NEXT: movl %edx, (%rdi,%rax) +; NOPOSTRA-NEXT: shrl $3, %esi +; NOPOSTRA-NEXT: andl $60, %esi +; NOPOSTRA-NEXT: movl (%rdi,%rsi), %ecx +; NOPOSTRA-NEXT: btrl %eax, %ecx +; NOPOSTRA-NEXT: shlxl %eax, %edx, %eax +; NOPOSTRA-NEXT: orl %ecx, %eax +; NOPOSTRA-NEXT: movl %eax, (%rdi,%rsi) ; NOPOSTRA-NEXT: movq 16(%rdi), %rax ; NOPOSTRA-NEXT: movq (%rdi), %rcx ; NOPOSTRA-NEXT: movq 8(%rdi), %rdx diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll index 8f046a4f5aea5..26e68861cf45c 100644 --- a/llvm/test/CodeGen/X86/rotate-extract.ll +++ b/llvm/test/CodeGen/X86/rotate-extract.ll @@ -203,10 +203,10 @@ define i16 @no_extract_mul(i16 %i) nounwind { ; X64-LABEL: no_extract_mul: ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi +; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $8, %edi ; X64-NEXT: leal (%rdi,%rdi,8), %ecx +; X64-NEXT: leal (%rax,%rax,8), %eax ; X64-NEXT: movzwl %ax, %eax ; X64-NEXT: shrl $9, %eax ; X64-NEXT: orl %ecx, %eax diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll index ce56283df6010..8cb032776114b 100644 --- a/llvm/test/CodeGen/X86/smul_fix.ll +++ b/llvm/test/CodeGen/X86/smul_fix.ll @@ -10,10 +10,10 @@ declare <4 x i32> @llvm.smul.fix.v4i32(<4 x i32>, <4 x i32>, i32) define i32 @func(i32 %x, i32 %y) nounwind { ; X64-LABEL: func: ; X64: # %bb.0: -; X64-NEXT: movslq %esi, %rax -; X64-NEXT: movslq %edi, %rcx -; X64-NEXT: imulq %rax, %rcx -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movslq %esi, %rcx +; X64-NEXT: movslq %edi, %rax +; X64-NEXT: imulq %rcx, %rax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: shrq $32, %rax ; X64-NEXT: shldl $30, %ecx, %eax ; X64-NEXT: # kill: def $eax killed $eax killed $rax diff --git a/llvm/test/CodeGen/X86/sshl_sat.ll b/llvm/test/CodeGen/X86/sshl_sat.ll index e5ea911d4771a..a93be22bf5861 100644 --- a/llvm/test/CodeGen/X86/sshl_sat.ll +++ b/llvm/test/CodeGen/X86/sshl_sat.ll @@ -15,16 +15,16 @@ define i16 @func(i16 %x, i16 %y) nounwind { ; X64: # %bb.0: ; X64-NEXT: movl %esi, %ecx ; X64-NEXT: movl %edi, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movswl %dx, %esi +; X64-NEXT: shll %cl, %edi +; X64-NEXT: movswl %di, %esi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: sarl %cl, %esi ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: testw %di, %di +; X64-NEXT: testw %dx, %dx ; X64-NEXT: sets %al ; X64-NEXT: addl $32767, %eax # imm = 0x7FFF -; X64-NEXT: cmpw %si, %di -; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: cmpw %si, %dx +; X64-NEXT: cmovel %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; @@ -33,17 +33,17 @@ define i16 @func(i16 %x, i16 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movswl %dx, %edi ; X86-NEXT: sarl %cl, %edi ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %dx, %dx +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %al ; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %di, %dx -; X86-NEXT: cmovel %esi, %eax +; X86-NEXT: cmpw %di, %si +; X86-NEXT: cmovel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -58,18 
+58,18 @@ define i16 @func2(i8 %x, i8 %y) nounwind { ; X64-NEXT: movl %esi, %ecx ; X64-NEXT: movsbl %dil, %eax ; X64-NEXT: addl %eax, %eax -; X64-NEXT: xorl %edx, %edx +; X64-NEXT: movl %eax, %edx +; X64-NEXT: xorl %esi, %esi ; X64-NEXT: testw %ax, %ax -; X64-NEXT: sets %dl -; X64-NEXT: addl $32767, %edx # imm = 0x7FFF -; X64-NEXT: movl %eax, %esi -; X64-NEXT: shll %cl, %esi -; X64-NEXT: movswl %si, %edi +; X64-NEXT: sets %sil +; X64-NEXT: addl $32767, %esi # imm = 0x7FFF +; X64-NEXT: shll %cl, %eax +; X64-NEXT: movswl %ax, %edi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: sarl %cl, %edi -; X64-NEXT: cmpw %di, %ax -; X64-NEXT: cmovnel %edx, %esi -; X64-NEXT: movswl %si, %eax +; X64-NEXT: cmpw %di, %dx +; X64-NEXT: cmovnel %esi, %eax +; X64-NEXT: cwtl ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll index 10dee14bdd1a0..ff76707bdbb69 100644 --- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll @@ -365,119 +365,118 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %edi, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movswl %bx, %ebp +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movswl %di, %ebp ; X86-NEXT: sarl %cl, %ebp ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %di, %di +; X86-NEXT: testw %bx, %bx ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %bp, %di -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmovel %ebx, %ecx +; X86-NEXT: cmpw %bp, %bx +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: cmovel %edi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movswl %di, %ebx -; X86-NEXT: sarl %cl, %ebx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %si, %si -; X86-NEXT: sets %al -; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %bx, %si ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovel %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %esi ; X86-NEXT: shll %cl, %esi ; X86-NEXT: movswl %si, %edi ; X86-NEXT: sarl %cl, %edi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testw %dx, %dx -; X86-NEXT: sets %al -; X86-NEXT: addl $32767, %eax # imm = 0x7FFF -; X86-NEXT: cmpw %di, %dx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: testw %bx, %bx +; X86-NEXT: sets %cl +; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: cmpw %di, %bx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovel %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx +; X86-NEXT: cmovel %esi, %ebp ; X86-NEXT: shll %cl, %edx ; X86-NEXT: movswl %dx, %esi ; X86-NEXT: sarl %cl, %esi ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %di, %di ; X86-NEXT: sets %bl ; X86-NEXT: addl $32767, %ebx # imm = 0x7FFF -; X86-NEXT: cmpw %si, %ax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %si, %di +; 
X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: cmovel %edx, %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %edx +; X86-NEXT: sarl %cl, %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: testw %si, %si +; X86-NEXT: sets %cl +; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF +; X86-NEXT: cmpw %dx, %si +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: cmovel %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: shll %cl, %edx -; X86-NEXT: movswl %dx, %esi -; X86-NEXT: sarl %cl, %esi +; X86-NEXT: movswl %dx, %eax +; X86-NEXT: sarl %cl, %eax ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %si, %ax +; X86-NEXT: cmpw %ax, %si ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmovel %edx, %ecx -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NEXT: movl %eax, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movswl %dx, %esi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %esi ; X86-NEXT: sarl %cl, %esi ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %dx, %dx ; X86-NEXT: sets %bl ; X86-NEXT: addl $32767, %ebx # imm = 0x7FFF -; X86-NEXT: cmpw %si, %ax -; X86-NEXT: cmovel %edx, %ebx +; X86-NEXT: cmpw %si, %dx +; X86-NEXT: cmovel %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %edi ; X86-NEXT: sarl %cl, %edi ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %dl ; X86-NEXT: addl $32767, %edx # imm = 0x7FFF -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: cmovel %esi, %edx +; X86-NEXT: cmpw %di, %si +; X86-NEXT: cmovel %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movswl %si, %edi +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movswl %ax, %edi ; X86-NEXT: sarl %cl, %edi ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: testw %ax, %ax +; X86-NEXT: testw %si, %si ; X86-NEXT: sets %cl ; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: cmovel %esi, %ecx +; X86-NEXT: cmpw %di, %si +; X86-NEXT: cmovel %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movw %cx, 14(%eax) ; X86-NEXT: movw %dx, 12(%eax) ; X86-NEXT: movw %bx, 10(%eax) -; X86-NEXT: movw %bp, 8(%eax) ; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movw %cx, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movw %cx, 6(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movw %cx, 4(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movw %cx, 2(%eax) +; X86-NEXT: movw %bp, 2(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movw %cx, (%eax) ; X86-NEXT: addl $16, %esp diff --git a/llvm/test/CodeGen/X86/stackmap.ll b/llvm/test/CodeGen/X86/stackmap.ll index 72406aaa4efa8..9bf88cb8bdf81 100644 --- a/llvm/test/CodeGen/X86/stackmap.ll +++ 
b/llvm/test/CodeGen/X86/stackmap.ll @@ -1,7 +1,10 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -terminal-rule=0 | FileCheck %s ; ; Note: Print verbose stackmaps using -debug-only=stackmaps. +; FIXME: The test should be fixed to produce the correctly sized spill once +; the -terminal-rule=0 flag is removed. + ; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps ; CHECK-NEXT: __LLVM_StackMaps: ; Header @@ -546,8 +549,8 @@ define void @clobberScratch(i32 %a) { ret void } -; A stack frame which needs to be realigned at runtime (to meet alignment -; criteria for values on the stack) does not have a fixed frame size. +; A stack frame which needs to be realigned at runtime (to meet alignment +; criteria for values on the stack) does not have a fixed frame size. ; CHECK-LABEL: .long L{{.*}}-_needsStackRealignment ; CHECK-NEXT: .short 0 ; 0 locations diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll index 5bd624c0697a0..01fbafb18eb9f 100644 --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -2429,126 +2429,126 @@ define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE2-ONLY: # %bb.0: ; SSE2-ONLY-NEXT: movl (%rdi), %eax ; SSE2-ONLY-NEXT: notl %eax -; SSE2-ONLY-NEXT: movw %ax, (%rsi) ; SSE2-ONLY-NEXT: movl %eax, %ecx -; SSE2-ONLY-NEXT: shrl $16, %ecx -; SSE2-ONLY-NEXT: movb %cl, 2(%rsi) -; SSE2-ONLY-NEXT: movb %cl, 2(%rdx) -; SSE2-ONLY-NEXT: movw %ax, (%rdx) -; SSE2-ONLY-NEXT: movb %cl, 6(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 4(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 10(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 8(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 14(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 12(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 18(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 16(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 22(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 20(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 26(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 24(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 30(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 28(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 34(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 32(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 38(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 36(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 42(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 40(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 46(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 44(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 50(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 48(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 54(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 52(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 58(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 56(%rdx) -; SSE2-ONLY-NEXT: movb %cl, 62(%rdx) -; SSE2-ONLY-NEXT: movw %ax, 60(%rdx) +; SSE2-ONLY-NEXT: movw %ax, (%rsi) +; SSE2-ONLY-NEXT: shrl $16, %eax +; SSE2-ONLY-NEXT: movb %al, 2(%rsi) +; SSE2-ONLY-NEXT: movb %al, 2(%rdx) +; SSE2-ONLY-NEXT: movw %cx, (%rdx) +; SSE2-ONLY-NEXT: movb %al, 6(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 4(%rdx) +; SSE2-ONLY-NEXT: movb %al, 10(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 8(%rdx) +; SSE2-ONLY-NEXT: movb %al, 14(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 12(%rdx) +; SSE2-ONLY-NEXT: movb %al, 18(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 16(%rdx) +; SSE2-ONLY-NEXT: movb %al, 22(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 20(%rdx) +; SSE2-ONLY-NEXT: movb %al, 26(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 24(%rdx) +; SSE2-ONLY-NEXT: movb %al, 30(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 28(%rdx) +; SSE2-ONLY-NEXT: movb %al, 34(%rdx) +;
SSE2-ONLY-NEXT: movw %cx, 32(%rdx) +; SSE2-ONLY-NEXT: movb %al, 38(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 36(%rdx) +; SSE2-ONLY-NEXT: movb %al, 42(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 40(%rdx) +; SSE2-ONLY-NEXT: movb %al, 46(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 44(%rdx) +; SSE2-ONLY-NEXT: movb %al, 50(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 48(%rdx) +; SSE2-ONLY-NEXT: movb %al, 54(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 52(%rdx) +; SSE2-ONLY-NEXT: movb %al, 58(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 56(%rdx) +; SSE2-ONLY-NEXT: movb %al, 62(%rdx) +; SSE2-ONLY-NEXT: movw %cx, 60(%rdx) ; SSE2-ONLY-NEXT: retq ; ; SSE3-LABEL: vec384_v3i8: ; SSE3: # %bb.0: ; SSE3-NEXT: movl (%rdi), %eax ; SSE3-NEXT: notl %eax -; SSE3-NEXT: movw %ax, (%rsi) ; SSE3-NEXT: movl %eax, %ecx -; SSE3-NEXT: shrl $16, %ecx -; SSE3-NEXT: movb %cl, 2(%rsi) -; SSE3-NEXT: movb %cl, 2(%rdx) -; SSE3-NEXT: movw %ax, (%rdx) -; SSE3-NEXT: movb %cl, 6(%rdx) -; SSE3-NEXT: movw %ax, 4(%rdx) -; SSE3-NEXT: movb %cl, 10(%rdx) -; SSE3-NEXT: movw %ax, 8(%rdx) -; SSE3-NEXT: movb %cl, 14(%rdx) -; SSE3-NEXT: movw %ax, 12(%rdx) -; SSE3-NEXT: movb %cl, 18(%rdx) -; SSE3-NEXT: movw %ax, 16(%rdx) -; SSE3-NEXT: movb %cl, 22(%rdx) -; SSE3-NEXT: movw %ax, 20(%rdx) -; SSE3-NEXT: movb %cl, 26(%rdx) -; SSE3-NEXT: movw %ax, 24(%rdx) -; SSE3-NEXT: movb %cl, 30(%rdx) -; SSE3-NEXT: movw %ax, 28(%rdx) -; SSE3-NEXT: movb %cl, 34(%rdx) -; SSE3-NEXT: movw %ax, 32(%rdx) -; SSE3-NEXT: movb %cl, 38(%rdx) -; SSE3-NEXT: movw %ax, 36(%rdx) -; SSE3-NEXT: movb %cl, 42(%rdx) -; SSE3-NEXT: movw %ax, 40(%rdx) -; SSE3-NEXT: movb %cl, 46(%rdx) -; SSE3-NEXT: movw %ax, 44(%rdx) -; SSE3-NEXT: movb %cl, 50(%rdx) -; SSE3-NEXT: movw %ax, 48(%rdx) -; SSE3-NEXT: movb %cl, 54(%rdx) -; SSE3-NEXT: movw %ax, 52(%rdx) -; SSE3-NEXT: movb %cl, 58(%rdx) -; SSE3-NEXT: movw %ax, 56(%rdx) -; SSE3-NEXT: movb %cl, 62(%rdx) -; SSE3-NEXT: movw %ax, 60(%rdx) +; SSE3-NEXT: movw %ax, (%rsi) +; SSE3-NEXT: shrl $16, %eax +; SSE3-NEXT: movb %al, 2(%rsi) +; SSE3-NEXT: movb %al, 2(%rdx) +; SSE3-NEXT: movw %cx, (%rdx) +; SSE3-NEXT: movb %al, 6(%rdx) +; SSE3-NEXT: movw %cx, 4(%rdx) +; SSE3-NEXT: movb %al, 10(%rdx) +; SSE3-NEXT: movw %cx, 8(%rdx) +; SSE3-NEXT: movb %al, 14(%rdx) +; SSE3-NEXT: movw %cx, 12(%rdx) +; SSE3-NEXT: movb %al, 18(%rdx) +; SSE3-NEXT: movw %cx, 16(%rdx) +; SSE3-NEXT: movb %al, 22(%rdx) +; SSE3-NEXT: movw %cx, 20(%rdx) +; SSE3-NEXT: movb %al, 26(%rdx) +; SSE3-NEXT: movw %cx, 24(%rdx) +; SSE3-NEXT: movb %al, 30(%rdx) +; SSE3-NEXT: movw %cx, 28(%rdx) +; SSE3-NEXT: movb %al, 34(%rdx) +; SSE3-NEXT: movw %cx, 32(%rdx) +; SSE3-NEXT: movb %al, 38(%rdx) +; SSE3-NEXT: movw %cx, 36(%rdx) +; SSE3-NEXT: movb %al, 42(%rdx) +; SSE3-NEXT: movw %cx, 40(%rdx) +; SSE3-NEXT: movb %al, 46(%rdx) +; SSE3-NEXT: movw %cx, 44(%rdx) +; SSE3-NEXT: movb %al, 50(%rdx) +; SSE3-NEXT: movw %cx, 48(%rdx) +; SSE3-NEXT: movb %al, 54(%rdx) +; SSE3-NEXT: movw %cx, 52(%rdx) +; SSE3-NEXT: movb %al, 58(%rdx) +; SSE3-NEXT: movw %cx, 56(%rdx) +; SSE3-NEXT: movb %al, 62(%rdx) +; SSE3-NEXT: movw %cx, 60(%rdx) ; SSE3-NEXT: retq ; ; SSSE3-ONLY-LABEL: vec384_v3i8: ; SSSE3-ONLY: # %bb.0: ; SSSE3-ONLY-NEXT: movl (%rdi), %eax ; SSSE3-ONLY-NEXT: notl %eax -; SSSE3-ONLY-NEXT: movw %ax, (%rsi) ; SSSE3-ONLY-NEXT: movl %eax, %ecx -; SSSE3-ONLY-NEXT: shrl $16, %ecx -; SSSE3-ONLY-NEXT: movb %cl, 2(%rsi) -; SSSE3-ONLY-NEXT: movb %cl, 2(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, (%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 6(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 10(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 8(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 
14(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 18(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 16(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 22(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 26(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 24(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 30(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 34(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 32(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 38(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 42(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 40(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 46(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 50(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 48(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 54(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 58(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 56(%rdx) -; SSSE3-ONLY-NEXT: movb %cl, 62(%rdx) -; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx) +; SSSE3-ONLY-NEXT: movw %ax, (%rsi) +; SSSE3-ONLY-NEXT: shrl $16, %eax +; SSSE3-ONLY-NEXT: movb %al, 2(%rsi) +; SSSE3-ONLY-NEXT: movb %al, 2(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, (%rdx) +; SSSE3-ONLY-NEXT: movb %al, 6(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 4(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 10(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 8(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 14(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 12(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 18(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 16(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 22(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 20(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 26(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 24(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 30(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 28(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 34(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 32(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 38(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 36(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 42(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 40(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 46(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 44(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 50(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 48(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 54(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 52(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 58(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 56(%rdx) +; SSSE3-ONLY-NEXT: movb %al, 62(%rdx) +; SSSE3-ONLY-NEXT: movw %cx, 60(%rdx) ; SSSE3-ONLY-NEXT: retq ; ; SSE41-LABEL: vec384_v3i8: diff --git a/llvm/test/CodeGen/X86/twoaddr-lea.ll b/llvm/test/CodeGen/X86/twoaddr-lea.ll index f20b777531c5a..3ad3e9a0e7655 100644 --- a/llvm/test/CodeGen/X86/twoaddr-lea.ll +++ b/llvm/test/CodeGen/X86/twoaddr-lea.ll @@ -65,10 +65,10 @@ entry: define void @ham() { ; CHECK-LABEL: ham: ; CHECK: ## %bb.0: ## %bb +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: movq _global@GOTPCREL(%rip), %rdx ; CHECK-NEXT: movq _global2@GOTPCREL(%rip), %rsi -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %cl, %cl ; CHECK-NEXT: je LBB3_2 ; CHECK-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll index eacc714b49a4d..5a68484596a2f 100644 --- a/llvm/test/CodeGen/X86/umul_fix.ll +++ b/llvm/test/CodeGen/X86/umul_fix.ll @@ -10,10 +10,10 @@ declare <4 x i32> @llvm.umul.fix.v4i32(<4 x i32>, <4 x i32>, i32) define i32 @func(i32 %x, i32 %y) nounwind { ; X64-LABEL: func: ; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: imulq %rax, %rcx -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: imulq %rcx, %rax +; 
X64-NEXT: movl %eax, %ecx ; X64-NEXT: shrq $32, %rax ; X64-NEXT: shldl $30, %ecx, %eax ; X64-NEXT: # kill: def $eax killed $eax killed $rax diff --git a/llvm/test/CodeGen/X86/ushl_sat.ll b/llvm/test/CodeGen/X86/ushl_sat.ll index e0e1ef7108d0d..9768e4761f47a 100644 --- a/llvm/test/CodeGen/X86/ushl_sat.ll +++ b/llvm/test/CodeGen/X86/ushl_sat.ll @@ -14,23 +14,23 @@ define i16 @func(i16 %x, i16 %y) nounwind { ; X64-LABEL: func: ; X64: # %bb.0: ; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl %edi, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movzwl %dx, %eax +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll %cl, %edi +; X64-NEXT: movzwl %di, %edx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %eax -; X64-NEXT: cmpw %ax, %di +; X64-NEXT: shrl %cl, %edx +; X64-NEXT: cmpw %dx, %ax ; X64-NEXT: movl $65535, %eax # imm = 0xFFFF -; X64-NEXT: cmovel %edx, %eax +; X64-NEXT: cmovel %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq ; ; X86-LABEL: func: ; X86: # %bb.0: ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %edx ; X86-NEXT: shll %cl, %edx ; X86-NEXT: movzwl %dx, %esi ; X86-NEXT: shrl %cl, %esi @@ -51,14 +51,14 @@ define i16 @func2(i8 %x, i8 %y) nounwind { ; X64-NEXT: movsbl %dil, %eax ; X64-NEXT: addl %eax, %eax ; X64-NEXT: movl %eax, %edx -; X64-NEXT: shll %cl, %edx -; X64-NEXT: movzwl %dx, %esi +; X64-NEXT: shll %cl, %eax +; X64-NEXT: movzwl %ax, %esi ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrl %cl, %esi -; X64-NEXT: cmpw %si, %ax -; X64-NEXT: movl $65535, %eax # imm = 0xFFFF -; X64-NEXT: cmovel %edx, %eax -; X64-NEXT: cwtl +; X64-NEXT: cmpw %si, %dx +; X64-NEXT: movl $65535, %ecx # imm = 0xFFFF +; X64-NEXT: cmovel %eax, %ecx +; X64-NEXT: movswl %cx, %eax ; X64-NEXT: shrl %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll index b8e83da9cf361..762088cfb2935 100644 --- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll @@ -300,95 +300,94 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %eax, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movzwl %bx, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpw %di, %ax -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: cmovnel %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %edx, %ecx ; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edi -; X86-NEXT: shrl %cl, %edi -; X86-NEXT: cmpw %di, %si +; X86-NEXT: movzwl %ax, %esi +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: cmpw %si, %dx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $65535, %esi # imm = 0xFFFF -; X86-NEXT: cmovnel %esi, %eax +; X86-NEXT: movl $65535, %edx # imm = 0xFFFF +; X86-NEXT: cmovnel %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 
%ebp, %eax -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: cmpw %dx, %bp -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovnel %esi, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %ebp ; X86-NEXT: shll %cl, %ebp -; X86-NEXT: movzwl %bp, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: cmpw %dx, %si -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl %bp, %eax +; X86-NEXT: shrl %cl, %eax +; X86-NEXT: cmpw %ax, %di +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovnel %eax, %ebp -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: cmovnel %edx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shll %cl, %ebx -; X86-NEXT: movzwl %bx, %esi -; X86-NEXT: shrl %cl, %esi -; X86-NEXT: cmpw %si, %dx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl %bx, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $65535, %esi # imm = 0xFFFF ; X86-NEXT: cmovnel %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movzwl %di, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: cmovnel %esi, %edi +; X86-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: shll %cl, %ebp +; X86-NEXT: movzwl %bp, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmovnel %esi, %ebp +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edi, %eax ; X86-NEXT: shll %cl, %edi -; X86-NEXT: movzwl %di, %eax -; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpw %ax, %dx +; X86-NEXT: movzwl %di, %edx +; X86-NEXT: shrl %cl, %edx +; X86-NEXT: cmpw %dx, %ax ; X86-NEXT: cmovnel %esi, %edi +; X86-NEXT: movl $65535, %ebx # imm = 0xFFFF ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movzwl %si, %eax +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movzwl %dx, %eax ; X86-NEXT: shrl %cl, %eax -; X86-NEXT: cmpw %ax, %dx -; X86-NEXT: movl $65535, %eax # imm = 0xFFFF -; X86-NEXT: cmovnel %eax, %esi +; X86-NEXT: cmpw %ax, %si +; X86-NEXT: cmovnel %ebx, %edx ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzwl %ax, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpw %dx, %cx +; X86-NEXT: movzwl %ax, %esi +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: cmpw %si, %bx ; X86-NEXT: movl $65535, %ecx # imm = 0xFFFF ; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movw %ax, 14(%ecx) -; X86-NEXT: movw %si, 12(%ecx) +; X86-NEXT: movw %dx, 12(%ecx) ; X86-NEXT: movw %di, 10(%ecx) -; X86-NEXT: movw %bx, 8(%ecx) -; X86-NEXT: movw %bp, 6(%ecx) +; X86-NEXT: movw %bp, 8(%ecx) ; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 6(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movw %ax, 4(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movw %ax, 2(%ecx) ; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movw %ax, (%ecx) ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: addl $12, %esp +; X86-NEXT: addl $16, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll index b233855029c58..324fe12de9400 100644 --- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll +++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll @@ -85,14 +85,14 @@ define <4 x i16> @smulfixsat(<4 x i16> %a) { ; CHECK-NEXT: movswl %dx, %edx ; CHECK-NEXT: leal (,%rdx,4), %esi ; CHECK-NEXT: movl %esi, %edi -; CHECK-NEXT: shrl $16, %edi -; CHECK-NEXT: shldw $1, %si, %di +; CHECK-NEXT: shrl $16, %esi +; CHECK-NEXT: shldw $1, %di, %si ; CHECK-NEXT: sarl $14, %edx ; CHECK-NEXT: cmpl $16384, %edx # imm = 0x4000 -; CHECK-NEXT: cmovgel %eax, %edi +; CHECK-NEXT: cmovgel %eax, %esi ; CHECK-NEXT: cmpl $-16384, %edx # imm = 0xC000 -; CHECK-NEXT: cmovll %ecx, %edi -; CHECK-NEXT: pinsrw $3, %edi, %xmm1 +; CHECK-NEXT: cmovll %ecx, %esi +; CHECK-NEXT: pinsrw $3, %esi, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %t = call <4 x i16> @llvm.smul.fix.sat.v4i16(<4 x i16> , <4 x i16> %a, i32 15) @@ -106,19 +106,19 @@ define <4 x i16> @umulfixsat(<4 x i16> %a) { ; CHECK-NEXT: pextrw $2, %xmm0, %eax ; CHECK-NEXT: leal (%rax,%rax,2), %eax ; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: shrl $16, %edx -; CHECK-NEXT: movl %edx, %ecx -; CHECK-NEXT: shldw $1, %ax, %cx -; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000 +; CHECK-NEXT: shrl $16, %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shldw $1, %dx, %cx +; CHECK-NEXT: cmpl $32768, %eax # imm = 0x8000 ; CHECK-NEXT: movl $65535, %eax # imm = 0xFFFF ; CHECK-NEXT: cmovael %eax, %ecx ; CHECK-NEXT: pextrw $1, %xmm0, %edx ; CHECK-NEXT: addl %edx, %edx ; CHECK-NEXT: movl %edx, %esi -; CHECK-NEXT: shrl $16, %esi -; CHECK-NEXT: movl %esi, %edi -; CHECK-NEXT: shldw $1, %dx, %di -; CHECK-NEXT: cmpl $32768, %esi # imm = 0x8000 +; CHECK-NEXT: shrl $16, %edx +; CHECK-NEXT: movl %edx, %edi +; CHECK-NEXT: shldw $1, %si, %di +; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000 ; CHECK-NEXT: cmovael %eax, %edi ; CHECK-NEXT: movd %xmm0, %edx ; CHECK-NEXT: xorl %esi, %esi @@ -133,10 +133,10 @@ define <4 x i16> @umulfixsat(<4 x i16> %a) { ; CHECK-NEXT: pextrw $3, %xmm0, %ecx ; CHECK-NEXT: shll $2, %ecx ; CHECK-NEXT: movl %ecx, %edx -; CHECK-NEXT: shrl $16, %edx -; CHECK-NEXT: movl %edx, %esi -; CHECK-NEXT: shldw $1, %cx, %si -; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000 +; CHECK-NEXT: shrl $16, %ecx +; CHECK-NEXT: movl %ecx, %esi +; CHECK-NEXT: shldw $1, %dx, %si +; CHECK-NEXT: cmpl $32768, %ecx # imm = 0x8000 ; CHECK-NEXT: cmovael %eax, %esi ; CHECK-NEXT: pinsrw $3, %esi, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll index 320dce840ea57..6cb43234d713b 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -397,8 +397,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind { ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $8, %ecx -; AVX512F-NEXT: xorb %al, %cl +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: xorb %cl, %al ; AVX512F-NEXT: setnp %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -409,8 +409,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind { ; AVX512BW-NEXT: 
vpmovw2m %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -421,8 +421,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind { ; AVX512VL-NEXT: vpmovw2m %ymm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -722,8 +722,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind { ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $8, %ecx -; AVX512F-NEXT: xorb %al, %cl +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: xorb %cl, %al ; AVX512F-NEXT: setnp %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -734,8 +734,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind { ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -746,8 +746,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind { ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -974,13 +974,13 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) nounwind { ; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k0 ; AVX512BW-NEXT: kmovq %k0, %rax -; AVX512BW-NEXT: movq %rax, %rcx -; AVX512BW-NEXT: shrq $32, %rcx -; AVX512BW-NEXT: xorl %eax, %ecx -; AVX512BW-NEXT: movl %ecx, %eax -; AVX512BW-NEXT: shrl $16, %eax +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrq $32, %rax ; AVX512BW-NEXT: xorl %ecx, %eax -; AVX512BW-NEXT: xorb %ah, %al +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $16, %ecx +; AVX512BW-NEXT: xorl %eax, %ecx +; AVX512BW-NEXT: xorb %ch, %cl ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -990,13 +990,13 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) nounwind { ; AVX512VL-NEXT: vpsllw $7, %zmm0, %zmm0 ; AVX512VL-NEXT: vpmovb2m %zmm0, %k0 ; AVX512VL-NEXT: kmovq %k0, %rax -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $32, %rcx -; AVX512VL-NEXT: xorl %eax, %ecx -; AVX512VL-NEXT: movl %ecx, %eax -; AVX512VL-NEXT: shrl $16, %eax +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: shrq $32, %rax ; AVX512VL-NEXT: xorl %ecx, %eax -; AVX512VL-NEXT: xorb %ah, %al +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: shrl $16, %ecx +; AVX512VL-NEXT: xorl %eax, %ecx +; AVX512VL-NEXT: xorb %ch, %cl ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -1211,8 +1211,8 @@ define i1 @icmp0_v16i8_v16i1(<16 x i8>) nounwind { ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; 
AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1222,8 +1222,8 @@ define i1 @icmp0_v16i8_v16i1(<16 x i8>) nounwind { ; AVX512VL-NEXT: vptestnmb %xmm0, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = icmp eq <16 x i8> %0, zeroinitializer @@ -1427,8 +1427,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind { ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $8, %ecx -; AVX512F-NEXT: xorb %al, %cl +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: xorb %cl, %al ; AVX512F-NEXT: setnp %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1439,8 +1439,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind { ; AVX512BW-NEXT: vptestnmw %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1450,8 +1450,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind { ; AVX512VL-NEXT: vptestnmw %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -1756,8 +1756,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind { ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $8, %ecx -; AVX512F-NEXT: xorb %al, %cl +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: xorb %cl, %al ; AVX512F-NEXT: setnp %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1767,8 +1767,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind { ; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1778,8 +1778,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind { ; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -2010,13 +2010,13 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovq %k0, %rax -; AVX512BW-NEXT: movq %rax, %rcx -; AVX512BW-NEXT: shrq $32, %rcx -; AVX512BW-NEXT: xorl %eax, %ecx -; AVX512BW-NEXT: movl %ecx, %eax -; AVX512BW-NEXT: shrl $16, %eax +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrq $32, %rax ; AVX512BW-NEXT: xorl %ecx, %eax -; AVX512BW-NEXT: xorb %ah, %al +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $16, %ecx +; AVX512BW-NEXT: xorl %eax, %ecx +; AVX512BW-NEXT: xorb %ch, %cl ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2025,13 +2025,13 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) nounwind { ; AVX512VL: # 
%bb.0: ; AVX512VL-NEXT: vptestnmb %zmm0, %zmm0, %k0 ; AVX512VL-NEXT: kmovq %k0, %rax -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $32, %rcx -; AVX512VL-NEXT: xorl %eax, %ecx -; AVX512VL-NEXT: movl %ecx, %eax -; AVX512VL-NEXT: shrl $16, %eax +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: shrq $32, %rax ; AVX512VL-NEXT: xorl %ecx, %eax -; AVX512VL-NEXT: xorb %ah, %al +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: shrl $16, %ecx +; AVX512VL-NEXT: xorl %eax, %ecx +; AVX512VL-NEXT: xorb %ch, %cl ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -2240,8 +2240,8 @@ define i1 @icmp_v16i8_v16i1(<16 x i8>, <16 x i8>) nounwind { ; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2251,8 +2251,8 @@ define i1 @icmp_v16i8_v16i1(<16 x i8>, <16 x i8>) nounwind { ; AVX512VL-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: retq %a = icmp eq <16 x i8> %0, %1 @@ -2504,8 +2504,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind { ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $8, %ecx -; AVX512F-NEXT: xorb %al, %cl +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: xorb %cl, %al ; AVX512F-NEXT: setnp %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2517,8 +2517,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind { ; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2528,8 +2528,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind { ; AVX512VL-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -2845,8 +2845,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind { ; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: movl %eax, %ecx -; AVX512F-NEXT: shrl $8, %ecx -; AVX512F-NEXT: xorb %al, %cl +; AVX512F-NEXT: shrl $8, %eax +; AVX512F-NEXT: xorb %cl, %al ; AVX512F-NEXT: setnp %al ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2856,8 +2856,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind { ; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: movl %eax, %ecx -; AVX512BW-NEXT: shrl $8, %ecx -; AVX512BW-NEXT: xorb %al, %cl +; AVX512BW-NEXT: shrl $8, %eax +; AVX512BW-NEXT: xorb %cl, %al ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2867,8 +2867,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind { ; AVX512VL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax ; 
AVX512VL-NEXT: movl %eax, %ecx -; AVX512VL-NEXT: shrl $8, %ecx -; AVX512VL-NEXT: xorb %al, %cl +; AVX512VL-NEXT: shrl $8, %eax +; AVX512VL-NEXT: xorb %cl, %al ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -3097,13 +3097,13 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 ; AVX512BW-NEXT: kmovq %k0, %rax -; AVX512BW-NEXT: movq %rax, %rcx -; AVX512BW-NEXT: shrq $32, %rcx -; AVX512BW-NEXT: xorl %eax, %ecx -; AVX512BW-NEXT: movl %ecx, %eax -; AVX512BW-NEXT: shrl $16, %eax +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrq $32, %rax ; AVX512BW-NEXT: xorl %ecx, %eax -; AVX512BW-NEXT: xorb %ah, %al +; AVX512BW-NEXT: movl %eax, %ecx +; AVX512BW-NEXT: shrl $16, %ecx +; AVX512BW-NEXT: xorl %eax, %ecx +; AVX512BW-NEXT: xorb %ch, %cl ; AVX512BW-NEXT: setnp %al ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -3112,13 +3112,13 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 ; AVX512VL-NEXT: kmovq %k0, %rax -; AVX512VL-NEXT: movq %rax, %rcx -; AVX512VL-NEXT: shrq $32, %rcx -; AVX512VL-NEXT: xorl %eax, %ecx -; AVX512VL-NEXT: movl %ecx, %eax -; AVX512VL-NEXT: shrl $16, %eax +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: shrq $32, %rax ; AVX512VL-NEXT: xorl %ecx, %eax -; AVX512VL-NEXT: xorb %ah, %al +; AVX512VL-NEXT: movl %eax, %ecx +; AVX512VL-NEXT: shrl $16, %ecx +; AVX512VL-NEXT: xorl %eax, %ecx +; AVX512VL-NEXT: xorb %ch, %cl ; AVX512VL-NEXT: setnp %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll index 3c98eba69ae5b..1c3d27fac4203 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -777,31 +777,31 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, (%esp) +; FALLBACK18-NEXT: movl %eax, %ecx ; FALLBACK18-NEXT: andb $12, %bl -; FALLBACK18-NEXT: movzbl %bl, %esi -; FALLBACK18-NEXT: movl 4(%esp,%esi), %edi -; FALLBACK18-NEXT: movl 8(%esp,%esi), %ebx -; FALLBACK18-NEXT: shrxl %eax, %edi, %ebp -; FALLBACK18-NEXT: movl %eax, %edx -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%ebx,%ebx), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK18-NEXT: orl %ebp, %ecx -; FALLBACK18-NEXT: shrxl %eax, (%esp,%esi), %ebp -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edi -; FALLBACK18-NEXT: orl %ebp, %edi -; FALLBACK18-NEXT: shrxl %eax, %ebx, %ebx -; FALLBACK18-NEXT: movl 12(%esp,%esi), %esi -; FALLBACK18-NEXT: shrxl %eax, %esi, %eax -; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %edx -; FALLBACK18-NEXT: orl %ebx, %edx +; FALLBACK18-NEXT: movzbl %bl, %edi +; FALLBACK18-NEXT: movl 4(%esp,%edi), %ebx +; FALLBACK18-NEXT: movl 8(%esp,%edi), %esi +; FALLBACK18-NEXT: shrxl %ecx, %ebx, %ebp +; FALLBACK18-NEXT: notb %al +; FALLBACK18-NEXT: leal (%esi,%esi), %edx +; FALLBACK18-NEXT: shlxl %eax, %edx, %edx +; FALLBACK18-NEXT: orl %ebp, %edx +; FALLBACK18-NEXT: shrxl %ecx, (%esp,%edi), %ebp +; FALLBACK18-NEXT: addl %ebx, %ebx +; FALLBACK18-NEXT: shlxl %eax, %ebx, %ebx +; FALLBACK18-NEXT: orl %ebp, %ebx +; FALLBACK18-NEXT: movl 
12(%esp,%edi), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %ebp +; FALLBACK18-NEXT: shlxl %eax, %ebp, %eax +; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK18-NEXT: orl %esi, %eax +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ecx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK18-NEXT: movl %eax, 12(%esi) -; FALLBACK18-NEXT: movl %edx, 8(%esi) -; FALLBACK18-NEXT: movl %edi, (%esi) -; FALLBACK18-NEXT: movl %ecx, 4(%esi) +; FALLBACK18-NEXT: movl %ecx, 12(%esi) +; FALLBACK18-NEXT: movl %eax, 8(%esi) +; FALLBACK18-NEXT: movl %ebx, (%esi) +; FALLBACK18-NEXT: movl %edx, 4(%esi) ; FALLBACK18-NEXT: addl $44, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -962,42 +962,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: pushl %ebx ; FALLBACK22-NEXT: pushl %edi ; FALLBACK22-NEXT: pushl %esi -; FALLBACK22-NEXT: subl $44, %esp +; FALLBACK22-NEXT: subl $60, %esp ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %eax ; FALLBACK22-NEXT: shlb $3, %al ; FALLBACK22-NEXT: xorps %xmm1, %xmm1 ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movaps %xmm0, (%esp) -; FALLBACK22-NEXT: andb $12, %cl -; FALLBACK22-NEXT: movzbl %cl, %edi -; FALLBACK22-NEXT: shrxl %eax, (%esp,%edi), %ebx +; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %eax, %ecx -; FALLBACK22-NEXT: notb %cl -; FALLBACK22-NEXT: movl 4(%esp,%edi), %ebp -; FALLBACK22-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK22-NEXT: leal (%ebp,%ebp), %edx -; FALLBACK22-NEXT: shlxl %ecx, %edx, %edx -; FALLBACK22-NEXT: orl %ebx, %edx -; FALLBACK22-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK22-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK22-NEXT: movl 12(%esp,%edi), %edi -; FALLBACK22-NEXT: shrxl %eax, %edi, %eax -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK22-NEXT: orl %ebx, %edi -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ecx, %esi, %ecx -; FALLBACK22-NEXT: orl %ebp, %ecx +; FALLBACK22-NEXT: andb $12, %dl +; FALLBACK22-NEXT: movzbl %dl, %edi +; FALLBACK22-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp +; FALLBACK22-NEXT: notb %al +; FALLBACK22-NEXT: movl 20(%esp,%edi), %edx +; FALLBACK22-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 24(%esp,%edi), %ebx +; FALLBACK22-NEXT: addl %edx, %edx +; FALLBACK22-NEXT: shlxl %eax, %edx, %edx +; FALLBACK22-NEXT: orl %ebp, %edx +; FALLBACK22-NEXT: movl 28(%esp,%edi), %ebp +; FALLBACK22-NEXT: leal (%ebp,%ebp), %edi +; FALLBACK22-NEXT: shlxl %eax, %edi, %edi +; FALLBACK22-NEXT: shrxl %ecx, %ebx, %esi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: addl %ebx, %ebx +; FALLBACK22-NEXT: shlxl %eax, %ebx, %eax +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: shrxl %ecx, %ebp, %ecx ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK22-NEXT: movl %eax, 12(%esi) -; FALLBACK22-NEXT: movl %ecx, 4(%esi) +; FALLBACK22-NEXT: movl %ecx, 12(%esi) +; FALLBACK22-NEXT: movl %eax, 4(%esi) ; FALLBACK22-NEXT: movl %edi, 8(%esi) ; FALLBACK22-NEXT: movl %edx, (%esi) -; FALLBACK22-NEXT: addl $44, %esp +; FALLBACK22-NEXT: addl $60, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi ; FALLBACK22-NEXT: popl %ebx @@ 
-1152,42 +1153,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: pushl %ebx ; FALLBACK26-NEXT: pushl %edi ; FALLBACK26-NEXT: pushl %esi -; FALLBACK26-NEXT: subl $44, %esp +; FALLBACK26-NEXT: subl $60, %esp ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %eax ; FALLBACK26-NEXT: shlb $3, %al ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK26-NEXT: andb $12, %cl -; FALLBACK26-NEXT: movzbl %cl, %edi -; FALLBACK26-NEXT: shrxl %eax, (%esp,%edi), %ebx +; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %eax, %ecx -; FALLBACK26-NEXT: notb %cl -; FALLBACK26-NEXT: movl 4(%esp,%edi), %ebp -; FALLBACK26-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK26-NEXT: leal (%ebp,%ebp), %edx -; FALLBACK26-NEXT: shlxl %ecx, %edx, %edx -; FALLBACK26-NEXT: orl %ebx, %edx -; FALLBACK26-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK26-NEXT: movl 12(%esp,%edi), %edi -; FALLBACK26-NEXT: shrxl %eax, %edi, %eax -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK26-NEXT: orl %ebx, %edi -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ecx, %esi, %ecx -; FALLBACK26-NEXT: orl %ebp, %ecx +; FALLBACK26-NEXT: andb $12, %dl +; FALLBACK26-NEXT: movzbl %dl, %edi +; FALLBACK26-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp +; FALLBACK26-NEXT: notb %al +; FALLBACK26-NEXT: movl 20(%esp,%edi), %edx +; FALLBACK26-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 24(%esp,%edi), %ebx +; FALLBACK26-NEXT: addl %edx, %edx +; FALLBACK26-NEXT: shlxl %eax, %edx, %edx +; FALLBACK26-NEXT: orl %ebp, %edx +; FALLBACK26-NEXT: movl 28(%esp,%edi), %ebp +; FALLBACK26-NEXT: leal (%ebp,%ebp), %edi +; FALLBACK26-NEXT: shlxl %eax, %edi, %edi +; FALLBACK26-NEXT: shrxl %ecx, %ebx, %esi +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: addl %ebx, %ebx +; FALLBACK26-NEXT: shlxl %eax, %ebx, %eax +; FALLBACK26-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: shrxl %ecx, %ebp, %ecx ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK26-NEXT: movl %eax, 12(%esi) -; FALLBACK26-NEXT: movl %ecx, 4(%esi) +; FALLBACK26-NEXT: movl %ecx, 12(%esi) +; FALLBACK26-NEXT: movl %eax, 4(%esi) ; FALLBACK26-NEXT: movl %edi, 8(%esi) ; FALLBACK26-NEXT: movl %edx, (%esi) -; FALLBACK26-NEXT: addl $44, %esp +; FALLBACK26-NEXT: addl $60, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi ; FALLBACK26-NEXT: popl %ebx @@ -1342,42 +1344,43 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: pushl %ebx ; FALLBACK30-NEXT: pushl %edi ; FALLBACK30-NEXT: pushl %esi -; FALLBACK30-NEXT: subl $44, %esp +; FALLBACK30-NEXT: subl $60, %esp ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %eax ; FALLBACK30-NEXT: shlb $3, %al ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovaps %xmm1, 
{{[0-9]+}}(%esp) -; FALLBACK30-NEXT: vmovaps %xmm0, (%esp) -; FALLBACK30-NEXT: andb $12, %cl -; FALLBACK30-NEXT: movzbl %cl, %edi -; FALLBACK30-NEXT: shrxl %eax, (%esp,%edi), %ebx +; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %eax, %ecx -; FALLBACK30-NEXT: notb %cl -; FALLBACK30-NEXT: movl 4(%esp,%edi), %ebp -; FALLBACK30-NEXT: movl 8(%esp,%edi), %esi -; FALLBACK30-NEXT: leal (%ebp,%ebp), %edx -; FALLBACK30-NEXT: shlxl %ecx, %edx, %edx -; FALLBACK30-NEXT: orl %ebx, %edx -; FALLBACK30-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp -; FALLBACK30-NEXT: movl 12(%esp,%edi), %edi -; FALLBACK30-NEXT: shrxl %eax, %edi, %eax -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK30-NEXT: orl %ebx, %edi -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ecx, %esi, %ecx -; FALLBACK30-NEXT: orl %ebp, %ecx +; FALLBACK30-NEXT: andb $12, %dl +; FALLBACK30-NEXT: movzbl %dl, %edi +; FALLBACK30-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp +; FALLBACK30-NEXT: notb %al +; FALLBACK30-NEXT: movl 20(%esp,%edi), %edx +; FALLBACK30-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 24(%esp,%edi), %ebx +; FALLBACK30-NEXT: addl %edx, %edx +; FALLBACK30-NEXT: shlxl %eax, %edx, %edx +; FALLBACK30-NEXT: orl %ebp, %edx +; FALLBACK30-NEXT: movl 28(%esp,%edi), %ebp +; FALLBACK30-NEXT: leal (%ebp,%ebp), %edi +; FALLBACK30-NEXT: shlxl %eax, %edi, %edi +; FALLBACK30-NEXT: shrxl %ecx, %ebx, %esi +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: addl %ebx, %ebx +; FALLBACK30-NEXT: shlxl %eax, %ebx, %eax +; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: shrxl %ecx, %ebp, %ecx ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK30-NEXT: movl %eax, 12(%esi) -; FALLBACK30-NEXT: movl %ecx, 4(%esi) +; FALLBACK30-NEXT: movl %ecx, 12(%esi) +; FALLBACK30-NEXT: movl %eax, 4(%esi) ; FALLBACK30-NEXT: movl %edi, 8(%esi) ; FALLBACK30-NEXT: movl %edx, (%esi) -; FALLBACK30-NEXT: addl $44, %esp +; FALLBACK30-NEXT: addl $60, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi ; FALLBACK30-NEXT: popl %ebx @@ -1784,41 +1787,41 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl 4(%ecx), %esi ; FALLBACK18-NEXT: movl 8(%ecx), %edi ; FALLBACK18-NEXT: movl 12(%ecx), %ecx -; FALLBACK18-NEXT: movzbl (%eax), %eax -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: shlb $3, %bl +; FALLBACK18-NEXT: movzbl (%eax), %ebx +; FALLBACK18-NEXT: movl %ebx, %eax +; FALLBACK18-NEXT: shlb $3, %al ; FALLBACK18-NEXT: xorps %xmm0, %xmm0 ; FALLBACK18-NEXT: movaps %xmm0, (%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: andb $12, %al -; FALLBACK18-NEXT: negb %al -; FALLBACK18-NEXT: movsbl %al, %edx -; FALLBACK18-NEXT: movl 16(%esp,%edx), %edi -; FALLBACK18-NEXT: movl 20(%esp,%edx), %ecx -; FALLBACK18-NEXT: shlxl %ebx, %ecx, %esi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %ebp -; FALLBACK18-NEXT: movl %ebx, %eax +; FALLBACK18-NEXT: movl %eax, %ecx +; FALLBACK18-NEXT: andb $12, %bl +; FALLBACK18-NEXT: negb %bl +; FALLBACK18-NEXT: movsbl %bl, %esi +; FALLBACK18-NEXT: movl 16(%esp,%esi), %ebx +; FALLBACK18-NEXT: movl 20(%esp,%esi), %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edi ; FALLBACK18-NEXT: notb %al -; FALLBACK18-NEXT: 
shrl %edi -; FALLBACK18-NEXT: shrxl %eax, %edi, %edi -; FALLBACK18-NEXT: orl %esi, %edi -; FALLBACK18-NEXT: shlxl %ebx, 28(%esp,%edx), %esi -; FALLBACK18-NEXT: movl 24(%esp,%edx), %edx -; FALLBACK18-NEXT: shlxl %ebx, %edx, %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ebp +; FALLBACK18-NEXT: shrl %ebx +; FALLBACK18-NEXT: shrxl %eax, %ebx, %ebx +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: shlxl %ecx, 28(%esp,%esi), %edi +; FALLBACK18-NEXT: movl 24(%esp,%esi), %esi +; FALLBACK18-NEXT: shlxl %ecx, %esi, %ecx +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %eax, %esi, %esi +; FALLBACK18-NEXT: orl %edi, %esi ; FALLBACK18-NEXT: shrl %edx -; FALLBACK18-NEXT: shrxl %eax, %edx, %edx -; FALLBACK18-NEXT: orl %esi, %edx -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %eax, %ecx, %eax -; FALLBACK18-NEXT: orl %ebx, %eax +; FALLBACK18-NEXT: shrxl %eax, %edx, %eax +; FALLBACK18-NEXT: orl %ecx, %eax ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK18-NEXT: movl %ebp, (%ecx) ; FALLBACK18-NEXT: movl %eax, 8(%ecx) -; FALLBACK18-NEXT: movl %edx, 12(%ecx) -; FALLBACK18-NEXT: movl %edi, 4(%ecx) +; FALLBACK18-NEXT: movl %esi, 12(%ecx) +; FALLBACK18-NEXT: movl %ebx, 4(%ecx) ; FALLBACK18-NEXT: addl $44, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -1983,39 +1986,39 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %eax ; FALLBACK22-NEXT: shlb $3, %al ; FALLBACK22-NEXT: xorps %xmm1, %xmm1 ; FALLBACK22-NEXT: movaps %xmm1, (%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $12, %cl -; FALLBACK22-NEXT: negb %cl -; FALLBACK22-NEXT: movsbl %cl, %ecx -; FALLBACK22-NEXT: shlxl %eax, 28(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl 24(%esp,%ecx), %edx -; FALLBACK22-NEXT: shlxl %eax, %edx, %edi -; FALLBACK22-NEXT: movl %eax, %ebx -; FALLBACK22-NEXT: notb %bl -; FALLBACK22-NEXT: shrl %edx -; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK22-NEXT: orl %esi, %edx -; FALLBACK22-NEXT: movl 20(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl %esi, %ebp +; FALLBACK22-NEXT: movl %eax, %ecx +; FALLBACK22-NEXT: andb $12, %dl +; FALLBACK22-NEXT: negb %dl +; FALLBACK22-NEXT: movsbl %dl, %edx +; FALLBACK22-NEXT: shlxl %ecx, 28(%esp,%edx), %edi +; FALLBACK22-NEXT: notb %al +; FALLBACK22-NEXT: movl 24(%esp,%edx), %esi +; FALLBACK22-NEXT: shlxl %ecx, %esi, %ebx +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %eax, %esi, %esi +; FALLBACK22-NEXT: orl %edi, %esi +; FALLBACK22-NEXT: movl 20(%esp,%edx), %edi +; FALLBACK22-NEXT: movl %edi, %ebp ; FALLBACK22-NEXT: shrl %ebp -; FALLBACK22-NEXT: shrxl %ebx, %ebp, %ebp -; FALLBACK22-NEXT: orl %edi, %ebp -; FALLBACK22-NEXT: shlxl %eax, %esi, %esi -; FALLBACK22-NEXT: movl 16(%esp,%ecx), %ecx -; FALLBACK22-NEXT: shlxl %eax, %ecx, %eax -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: orl %esi, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK22-NEXT: movl %eax, (%esi) -; FALLBACK22-NEXT: movl %ecx, 4(%esi) -; FALLBACK22-NEXT: movl %ebp, 8(%esi) -; FALLBACK22-NEXT: movl %edx, 12(%esi) +; FALLBACK22-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK22-NEXT: orl %ebx, %ebp +; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK22-NEXT: movl 16(%esp,%edx), %edx +; 
FALLBACK22-NEXT: shlxl %ecx, %edx, %ecx +; FALLBACK22-NEXT: shrl %edx +; FALLBACK22-NEXT: shrxl %eax, %edx, %eax +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %ecx, (%edx) +; FALLBACK22-NEXT: movl %eax, 4(%edx) +; FALLBACK22-NEXT: movl %ebp, 8(%edx) +; FALLBACK22-NEXT: movl %esi, 12(%edx) ; FALLBACK22-NEXT: addl $44, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi @@ -2175,39 +2178,39 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %eax ; FALLBACK26-NEXT: shlb $3, %al ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $12, %cl -; FALLBACK26-NEXT: negb %cl -; FALLBACK26-NEXT: movsbl %cl, %ecx -; FALLBACK26-NEXT: shlxl %eax, 28(%esp,%ecx), %esi -; FALLBACK26-NEXT: movl 24(%esp,%ecx), %edx -; FALLBACK26-NEXT: shlxl %eax, %edx, %edi -; FALLBACK26-NEXT: movl %eax, %ebx -; FALLBACK26-NEXT: notb %bl -; FALLBACK26-NEXT: shrl %edx -; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK26-NEXT: orl %esi, %edx -; FALLBACK26-NEXT: movl 20(%esp,%ecx), %esi -; FALLBACK26-NEXT: movl %esi, %ebp +; FALLBACK26-NEXT: movl %eax, %ecx +; FALLBACK26-NEXT: andb $12, %dl +; FALLBACK26-NEXT: negb %dl +; FALLBACK26-NEXT: movsbl %dl, %edx +; FALLBACK26-NEXT: shlxl %ecx, 28(%esp,%edx), %edi +; FALLBACK26-NEXT: notb %al +; FALLBACK26-NEXT: movl 24(%esp,%edx), %esi +; FALLBACK26-NEXT: shlxl %ecx, %esi, %ebx +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %eax, %esi, %esi +; FALLBACK26-NEXT: orl %edi, %esi +; FALLBACK26-NEXT: movl 20(%esp,%edx), %edi +; FALLBACK26-NEXT: movl %edi, %ebp ; FALLBACK26-NEXT: shrl %ebp -; FALLBACK26-NEXT: shrxl %ebx, %ebp, %ebp -; FALLBACK26-NEXT: orl %edi, %ebp -; FALLBACK26-NEXT: shlxl %eax, %esi, %esi -; FALLBACK26-NEXT: movl 16(%esp,%ecx), %ecx -; FALLBACK26-NEXT: shlxl %eax, %ecx, %eax -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: orl %esi, %ecx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK26-NEXT: movl %eax, (%esi) -; FALLBACK26-NEXT: movl %ecx, 4(%esi) -; FALLBACK26-NEXT: movl %ebp, 8(%esi) -; FALLBACK26-NEXT: movl %edx, 12(%esi) +; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK26-NEXT: orl %ebx, %ebp +; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK26-NEXT: movl 16(%esp,%edx), %edx +; FALLBACK26-NEXT: shlxl %ecx, %edx, %ecx +; FALLBACK26-NEXT: shrl %edx +; FALLBACK26-NEXT: shrxl %eax, %edx, %eax +; FALLBACK26-NEXT: orl %edi, %eax +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: movl %ecx, (%edx) +; FALLBACK26-NEXT: movl %eax, 4(%edx) +; FALLBACK26-NEXT: movl %ebp, 8(%edx) +; FALLBACK26-NEXT: movl %esi, 12(%edx) ; FALLBACK26-NEXT: addl $44, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi @@ -2367,39 +2370,39 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %eax ; 
FALLBACK30-NEXT: shlb $3, %al ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $12, %cl -; FALLBACK30-NEXT: negb %cl -; FALLBACK30-NEXT: movsbl %cl, %ecx -; FALLBACK30-NEXT: shlxl %eax, 28(%esp,%ecx), %esi -; FALLBACK30-NEXT: movl 24(%esp,%ecx), %edx -; FALLBACK30-NEXT: shlxl %eax, %edx, %edi -; FALLBACK30-NEXT: movl %eax, %ebx -; FALLBACK30-NEXT: notb %bl -; FALLBACK30-NEXT: shrl %edx -; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK30-NEXT: orl %esi, %edx -; FALLBACK30-NEXT: movl 20(%esp,%ecx), %esi -; FALLBACK30-NEXT: movl %esi, %ebp +; FALLBACK30-NEXT: movl %eax, %ecx +; FALLBACK30-NEXT: andb $12, %dl +; FALLBACK30-NEXT: negb %dl +; FALLBACK30-NEXT: movsbl %dl, %edx +; FALLBACK30-NEXT: shlxl %ecx, 28(%esp,%edx), %edi +; FALLBACK30-NEXT: notb %al +; FALLBACK30-NEXT: movl 24(%esp,%edx), %esi +; FALLBACK30-NEXT: shlxl %ecx, %esi, %ebx +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %eax, %esi, %esi +; FALLBACK30-NEXT: orl %edi, %esi +; FALLBACK30-NEXT: movl 20(%esp,%edx), %edi +; FALLBACK30-NEXT: movl %edi, %ebp ; FALLBACK30-NEXT: shrl %ebp -; FALLBACK30-NEXT: shrxl %ebx, %ebp, %ebp -; FALLBACK30-NEXT: orl %edi, %ebp -; FALLBACK30-NEXT: shlxl %eax, %esi, %esi -; FALLBACK30-NEXT: movl 16(%esp,%ecx), %ecx -; FALLBACK30-NEXT: shlxl %eax, %ecx, %eax -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: orl %esi, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK30-NEXT: movl %eax, (%esi) -; FALLBACK30-NEXT: movl %ecx, 4(%esi) -; FALLBACK30-NEXT: movl %ebp, 8(%esi) -; FALLBACK30-NEXT: movl %edx, 12(%esi) +; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK30-NEXT: orl %ebx, %ebp +; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi +; FALLBACK30-NEXT: movl 16(%esp,%edx), %edx +; FALLBACK30-NEXT: shlxl %ecx, %edx, %ecx +; FALLBACK30-NEXT: shrl %edx +; FALLBACK30-NEXT: shrxl %eax, %edx, %eax +; FALLBACK30-NEXT: orl %edi, %eax +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl %ecx, (%edx) +; FALLBACK30-NEXT: movl %eax, 4(%edx) +; FALLBACK30-NEXT: movl %ebp, 8(%edx) +; FALLBACK30-NEXT: movl %esi, 12(%edx) ; FALLBACK30-NEXT: addl $44, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi @@ -2833,31 +2836,31 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, %ecx ; X86-NO-SHLD-HAVE-BMI2-NEXT: andb $12, %bl -; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl %bl, %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%esi), %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%esi), %ebx -; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %edi, %ebp -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, %edx -; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %dl -; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%ebx,%ebx), %ecx -; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %ecx, %ecx -; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %ecx -; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, (%esp,%esi), %ebp -; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %edi, %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %edi, %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %edi -; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %ebx, %ebx -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%esi), %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %eax, %esi, %eax -; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %esi, 
%esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %esi, %edx -; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebx, %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl %bl, %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%edi), %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%edi), %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %ebx, %ebp +; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %al +; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%esi,%esi), %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %edx, %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %edx +; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, (%esp,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %ebx, %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %ebx +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%edi), %edi +; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%edi,%edi), %ebp +; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %ebp, %eax +; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %esi, %esi +; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %esi, %eax +; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %edi, %ecx ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, 12(%esi) -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, 8(%esi) -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, (%esi) -; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 4(%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 12(%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, 8(%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ebx, (%esi) +; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, 4(%esi) ; X86-NO-SHLD-HAVE-BMI2-NEXT: addl $44, %esp ; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %esi ; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %edi @@ -3208,30 +3211,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $24, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: shrxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; 
FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_32bytes: @@ -3355,30 +3358,30 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %esi ; FALLBACK6-NEXT: andb $24, %cl ; FALLBACK6-NEXT: movzbl %cl, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: shrxq %rsi, -72(%rsp,%rcx), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx +; FALLBACK6-NEXT: leaq (%rcx,%rcx), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: shrxq %rsi, %rcx, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: lshr_32bytes: @@ -3487,35 +3490,35 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-LABEL: lshr_32bytes: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: leal (,%rcx,8), %eax +; FALLBACK10-NEXT: movzbl (%rsi), %eax +; FALLBACK10-NEXT: leal (,%rax,8), %ecx ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: andb $24, %cl -; FALLBACK10-NEXT: movzbl %cl, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: 
movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movl %ecx, %esi +; FALLBACK10-NEXT: andb $24, %al +; FALLBACK10-NEXT: movzbl %al, %eax +; FALLBACK10-NEXT: shrxq %rsi, -72(%rsp,%rax), %rdi +; FALLBACK10-NEXT: notb %cl +; FALLBACK10-NEXT: movq -64(%rsp,%rax), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rax), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rax), %rax +; FALLBACK10-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK10-NEXT: orq %rdi, %rcx +; FALLBACK10-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK10-NEXT: movq %rax, 24(%rdx) +; FALLBACK10-NEXT: movq %rcx, 8(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -3623,35 +3626,35 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-LABEL: lshr_32bytes: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: leal (,%rcx,8), %eax +; FALLBACK14-NEXT: movzbl (%rsi), %eax +; FALLBACK14-NEXT: leal (,%rax,8), %ecx ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: andb $24, %cl -; FALLBACK14-NEXT: movzbl %cl, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: movl %ecx, %esi +; FALLBACK14-NEXT: andb $24, %al +; FALLBACK14-NEXT: movzbl %al, %eax +; FALLBACK14-NEXT: shrxq %rsi, -72(%rsp,%rax), %rdi +; FALLBACK14-NEXT: notb %cl +; FALLBACK14-NEXT: movq -64(%rsp,%rax), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rax), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK14-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rax), %rax +; FALLBACK14-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK14-NEXT: orq %rdi, %rcx +; FALLBACK14-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK14-NEXT: movq %rax, 24(%rdx) +; FALLBACK14-NEXT: movq %rcx, 8(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq 
%r10, (%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -3914,81 +3917,75 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, %eax -; FALLBACK18-NEXT: shlb $3, %al +; FALLBACK18-NEXT: movl %ebx, %ecx +; FALLBACK18-NEXT: shlb $3, %cl ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, %eax ; FALLBACK18-NEXT: andb $28, %bl -; FALLBACK18-NEXT: movzbl %bl, %edi -; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, %esi, %edx +; FALLBACK18-NEXT: movzbl %bl, %esi +; FALLBACK18-NEXT: movl 36(%esp,%esi), %edx +; FALLBACK18-NEXT: movl 40(%esp,%esi), %ebp +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: notb %cl +; FALLBACK18-NEXT: leal (%ebp,%ebp), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ebx +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%esi), %edi +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: orl %edi, %edx ; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %eax, %edx -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %ebx, %ecx -; FALLBACK18-NEXT: shrxl %ebx, 32(%esp,%edi), %ebx -; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %eax -; FALLBACK18-NEXT: orl %ebx, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%esp,%edi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%eax,%eax), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi -; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK18-NEXT: movl %ecx, %eax -; FALLBACK18-NEXT: shrxl %ecx, %ebp, %ebx -; FALLBACK18-NEXT: orl %ebx, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl 48(%esp,%esi), %edx +; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %ebx +; FALLBACK18-NEXT: 
movl 44(%esp,%esi), %edx +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, %ebp, %edi +; FALLBACK18-NEXT: movl %eax, %ebp +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi -; FALLBACK18-NEXT: orl %esi, %ecx -; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 56(%esp,%esi), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: movl 52(%esp,%esi), %eax +; FALLBACK18-NEXT: shrxl %ebp, %eax, %ebx +; FALLBACK18-NEXT: orl %ebx, %edx +; FALLBACK18-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %esi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax -; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebx -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edi -; FALLBACK18-NEXT: orl %eax, %edi -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %ebx, 28(%eax) -; FALLBACK18-NEXT: movl %edi, 24(%eax) -; FALLBACK18-NEXT: movl %esi, 16(%eax) -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK18-NEXT: orl %ebx, %eax +; FALLBACK18-NEXT: movl 60(%esp,%esi), %esi +; FALLBACK18-NEXT: leal (%esi,%esi), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK18-NEXT: orl %edi, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %esi, %esi +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edi +; FALLBACK18-NEXT: movl %esi, 28(%edi) +; FALLBACK18-NEXT: movl %ecx, 24(%edi) +; FALLBACK18-NEXT: movl %eax, 16(%edi) +; FALLBACK18-NEXT: movl %edx, 20(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, (%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edi) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -4261,72 +4258,70 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 -; 
FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %edx -; FALLBACK22-NEXT: shlb $3, %dl +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %ecx +; FALLBACK22-NEXT: shlb $3, %cl ; FALLBACK22-NEXT: xorps %xmm2, %xmm2 ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $28, %cl -; FALLBACK22-NEXT: movzbl %cl, %edi -; FALLBACK22-NEXT: shrxl %edx, 32(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %edx, %eax -; FALLBACK22-NEXT: notb %al -; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %eax, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK22-NEXT: movl %eax, %ebp -; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK22-NEXT: shrxl %edx, %ecx, %ebx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK22-NEXT: movl 40(%esp,%edi), %eax +; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: andb $28, %dl +; FALLBACK22-NEXT: movzbl %dl, %ebx +; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%ebx), %edx +; FALLBACK22-NEXT: movl %eax, %edi +; FALLBACK22-NEXT: notb %cl +; FALLBACK22-NEXT: movl 36(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %eax, %ebx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %ebx -; FALLBACK22-NEXT: shlxl %ebp, %ebx, %eax -; FALLBACK22-NEXT: movl %ebp, %ecx -; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK22-NEXT: shrxl %edx, %ebx, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: leal (%eax,%eax), %esi +; FALLBACK22-NEXT: shlxl %ecx, %esi, %eax +; FALLBACK22-NEXT: orl %edx, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %ebx, %ebx +; FALLBACK22-NEXT: movl 48(%esp,%ebx), %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: leal (%eax,%eax), %edx +; FALLBACK22-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK22-NEXT: movl 44(%esp,%ebx), %edx +; FALLBACK22-NEXT: shrxl %edi, %edx, %esi +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %edx, %edx +; FALLBACK22-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK22-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK22-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %edi, %edx, %esi +; FALLBACK22-NEXT: movl %edi, %edx +; FALLBACK22-NEXT: orl %esi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 56(%esp,%ebx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %ebp +; FALLBACK22-NEXT: shlxl %ecx, %ebp, %ebp +; FALLBACK22-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK22-NEXT: 
shrxl %edi, %eax, %edi +; FALLBACK22-NEXT: orl %edi, %ebp +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ecx, %eax, %edi +; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %edx, %esi, %eax +; FALLBACK22-NEXT: movl 60(%esp,%ebx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %ebx ; FALLBACK22-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK22-NEXT: orl %ebp, %ebx -; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %eax -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: movl %ecx, %edx -; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK22-NEXT: orl %ebp, %edi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK22-NEXT: orl %esi, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK22-NEXT: movl %eax, 28(%edx) -; FALLBACK22-NEXT: movl %ecx, 4(%edx) -; FALLBACK22-NEXT: movl %edi, 24(%edx) -; FALLBACK22-NEXT: movl %ebx, 16(%edx) +; FALLBACK22-NEXT: orl %eax, %ebx ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 20(%edx) +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: orl %ecx, %eax +; FALLBACK22-NEXT: shrxl %edx, %esi, %ecx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %ecx, 28(%edx) +; FALLBACK22-NEXT: movl %eax, 4(%edx) +; FALLBACK22-NEXT: movl %ebx, 24(%edx) +; FALLBACK22-NEXT: movl %edi, 16(%edx) +; FALLBACK22-NEXT: movl %ebp, 20(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, 8(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -4585,70 +4580,68 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %edx -; FALLBACK26-NEXT: shlb $3, %dl +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %ecx +; FALLBACK26-NEXT: shlb $3, %cl ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $28, %cl -; FALLBACK26-NEXT: movzbl %cl, %edi -; FALLBACK26-NEXT: shrxl %edx, 32(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %edx, %eax -; FALLBACK26-NEXT: notb %al -; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %eax, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK26-NEXT: movl %eax, %ebp -; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK26-NEXT: shrxl 
%edx, %ecx, %ebx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK26-NEXT: movl 40(%esp,%edi), %eax +; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: andb $28, %dl +; FALLBACK26-NEXT: movzbl %dl, %ebx +; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%ebx), %edx +; FALLBACK26-NEXT: movl %eax, %edi +; FALLBACK26-NEXT: notb %cl +; FALLBACK26-NEXT: movl 36(%esp,%ebx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %eax, %ebx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK26-NEXT: leal (%esi,%esi), %ebx -; FALLBACK26-NEXT: shlxl %ebp, %ebx, %eax -; FALLBACK26-NEXT: movl %ebp, %ecx -; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK26-NEXT: shrxl %edx, %ebx, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: leal (%eax,%eax), %esi +; FALLBACK26-NEXT: shlxl %ecx, %esi, %eax +; FALLBACK26-NEXT: orl %edx, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %ebx, %ebx +; FALLBACK26-NEXT: movl 48(%esp,%ebx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: leal (%eax,%eax), %edx +; FALLBACK26-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK26-NEXT: movl 44(%esp,%ebx), %edx +; FALLBACK26-NEXT: shrxl %edi, %edx, %esi +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %edx, %edx +; FALLBACK26-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK26-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK26-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %edi, %edx, %esi +; FALLBACK26-NEXT: movl %edi, %edx +; FALLBACK26-NEXT: orl %esi, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 56(%esp,%ebx), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %ebp +; FALLBACK26-NEXT: shlxl %ecx, %ebp, %ebp +; FALLBACK26-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK26-NEXT: shrxl %edi, %eax, %edi +; FALLBACK26-NEXT: orl %edi, %ebp +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ecx, %eax, %edi +; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %edx, %esi, %eax +; FALLBACK26-NEXT: movl 60(%esp,%ebx), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %ebx ; FALLBACK26-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK26-NEXT: orl %ebp, %ebx -; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %eax -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: movl %ecx, %edx -; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK26-NEXT: orl %ebp, %edi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK26-NEXT: orl %esi, %ecx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK26-NEXT: movl %eax, 28(%edx) -; FALLBACK26-NEXT: 
movl %ecx, 4(%edx) -; FALLBACK26-NEXT: movl %edi, 24(%edx) -; FALLBACK26-NEXT: movl %ebx, 16(%edx) +; FALLBACK26-NEXT: orl %eax, %ebx ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 20(%edx) +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %ecx, %eax +; FALLBACK26-NEXT: shrxl %edx, %esi, %ecx +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK26-NEXT: movl %ecx, 28(%edx) +; FALLBACK26-NEXT: movl %eax, 4(%edx) +; FALLBACK26-NEXT: movl %ebx, 24(%edx) +; FALLBACK26-NEXT: movl %edi, 16(%edx) +; FALLBACK26-NEXT: movl %ebp, 20(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 8(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -4906,70 +4899,68 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %edx -; FALLBACK30-NEXT: shlb $3, %dl +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %ecx +; FALLBACK30-NEXT: shlb $3, %cl ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $28, %cl -; FALLBACK30-NEXT: movzbl %cl, %edi -; FALLBACK30-NEXT: shrxl %edx, 32(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %edx, %eax -; FALLBACK30-NEXT: notb %al -; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %eax, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK30-NEXT: movl %eax, %ebp -; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK30-NEXT: shrxl %edx, %ecx, %ebx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi -; FALLBACK30-NEXT: movl 40(%esp,%edi), %eax +; FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: andb $28, %dl +; FALLBACK30-NEXT: movzbl %dl, %ebx +; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%ebx), %edx +; FALLBACK30-NEXT: movl %eax, %edi +; FALLBACK30-NEXT: notb %cl +; FALLBACK30-NEXT: movl 36(%esp,%ebx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %eax, %ebx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %ebx -; FALLBACK30-NEXT: shlxl %ebp, %ebx, %eax -; FALLBACK30-NEXT: movl %ebp, %ecx -; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK30-NEXT: shrxl %edx, %ebx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: leal (%eax,%eax), %esi +; FALLBACK30-NEXT: shlxl %ecx, %esi, %eax +; FALLBACK30-NEXT: orl %edx, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl 
%edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %ebx, %ebx +; FALLBACK30-NEXT: movl 48(%esp,%ebx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: leal (%eax,%eax), %edx +; FALLBACK30-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK30-NEXT: movl 44(%esp,%ebx), %edx +; FALLBACK30-NEXT: shrxl %edi, %edx, %esi +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %edx, %edx +; FALLBACK30-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK30-NEXT: movl 40(%esp,%ebx), %edx +; FALLBACK30-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %edi, %edx, %esi +; FALLBACK30-NEXT: movl %edi, %edx +; FALLBACK30-NEXT: orl %esi, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 56(%esp,%ebx), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %ebp +; FALLBACK30-NEXT: shlxl %ecx, %ebp, %ebp +; FALLBACK30-NEXT: movl 52(%esp,%ebx), %eax +; FALLBACK30-NEXT: shrxl %edi, %eax, %edi +; FALLBACK30-NEXT: orl %edi, %ebp +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ecx, %eax, %edi +; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %edx, %esi, %eax +; FALLBACK30-NEXT: movl 60(%esp,%ebx), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %ebx ; FALLBACK30-NEXT: shlxl %ecx, %ebx, %ebx -; FALLBACK30-NEXT: orl %ebp, %ebx -; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %eax -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: movl %ecx, %edx -; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi -; FALLBACK30-NEXT: orl %ebp, %edi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK30-NEXT: orl %esi, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK30-NEXT: movl %eax, 28(%edx) -; FALLBACK30-NEXT: movl %ecx, 4(%edx) -; FALLBACK30-NEXT: movl %edi, 24(%edx) -; FALLBACK30-NEXT: movl %ebx, 16(%edx) +; FALLBACK30-NEXT: orl %eax, %ebx ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%edx) +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %ecx, %eax +; FALLBACK30-NEXT: shrxl %edx, %esi, %ecx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl %ecx, 28(%edx) +; FALLBACK30-NEXT: movl %eax, 4(%edx) +; FALLBACK30-NEXT: movl %ebx, 24(%edx) +; FALLBACK30-NEXT: movl %edi, 16(%edx) +; FALLBACK30-NEXT: movl %ebp, 20(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 8(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -5157,30 +5148,30 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $6, %sil -; FALLBACK2-NEXT: 
movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi,4), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi,4), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: shrxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_32bytes_dwordOff: @@ -5307,30 +5298,30 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %esi ; FALLBACK6-NEXT: andb $6, %cl ; FALLBACK6-NEXT: movzbl %cl, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: shrxq %rsi, -72(%rsp,%rcx,4), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx +; FALLBACK6-NEXT: leaq (%rcx,%rcx), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: shrxq %rsi, %rcx, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; 
FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: lshr_32bytes_dwordOff: @@ -5441,36 +5432,36 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK10-LABEL: lshr_32bytes_dwordOff: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: movl %ecx, %eax -; FALLBACK10-NEXT: shlb $5, %al +; FALLBACK10-NEXT: movzbl (%rsi), %eax +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: shlb $5, %cl ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: andb $6, %cl -; FALLBACK10-NEXT: movzbl %cl, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movl %ecx, %esi +; FALLBACK10-NEXT: andb $6, %al +; FALLBACK10-NEXT: movzbl %al, %eax +; FALLBACK10-NEXT: shrxq %rsi, -72(%rsp,%rax,4), %rdi +; FALLBACK10-NEXT: notb %cl +; FALLBACK10-NEXT: movq -64(%rsp,%rax,4), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rax,4), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rax,4), %rax +; FALLBACK10-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK10-NEXT: orq %rdi, %rcx +; FALLBACK10-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK10-NEXT: movq %rax, 24(%rdx) +; FALLBACK10-NEXT: movq %rcx, 8(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -5580,36 +5571,36 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK14-LABEL: lshr_32bytes_dwordOff: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: movl %ecx, %eax -; FALLBACK14-NEXT: shlb $5, %al +; FALLBACK14-NEXT: movzbl (%rsi), %eax +; FALLBACK14-NEXT: movl %eax, %ecx +; FALLBACK14-NEXT: shlb $5, %cl ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: andb $6, %cl -; FALLBACK14-NEXT: movzbl %cl, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; 
FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax -; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: movl %ecx, %esi +; FALLBACK14-NEXT: andb $6, %al +; FALLBACK14-NEXT: movzbl %al, %eax +; FALLBACK14-NEXT: shrxq %rsi, -72(%rsp,%rax,4), %rdi +; FALLBACK14-NEXT: notb %cl +; FALLBACK14-NEXT: movq -64(%rsp,%rax,4), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rax,4), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK14-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rsi, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rax,4), %rax +; FALLBACK14-NEXT: leaq (%rax,%rax), %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rsi, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK14-NEXT: orq %rdi, %rcx +; FALLBACK14-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK14-NEXT: movq %rax, 24(%rdx) +; FALLBACK14-NEXT: movq %rcx, 8(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, (%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -6025,31 +6016,31 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $24, %sil ; FALLBACK2-NEXT: negb %sil -; FALLBACK2-NEXT: movsbq %sil, %rsi -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8 -; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10 -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movsbq %sil, %rdi +; FALLBACK2-NEXT: movq -40(%rsp,%rdi), %r8 +; FALLBACK2-NEXT: movq -32(%rsp,%rdi), %rsi +; FALLBACK2-NEXT: shlxq %rcx, %rsi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK2-NEXT: shrq %r8 +; FALLBACK2-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK2-NEXT: orq %r9, %r8 +; FALLBACK2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 +; FALLBACK2-NEXT: movq -24(%rsp,%rdi), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK2-NEXT: shrq %rdi ; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi ; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, (%rdx) +; FALLBACK2-NEXT: shrxq %rax, %rsi, %rax +; FALLBACK2-NEXT: orq 
%rcx, %rax +; FALLBACK2-NEXT: movq %r10, (%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_32bytes: @@ -6167,38 +6158,38 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6: # %bb.0: ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movzbl (%rsi), %ecx -; FALLBACK6-NEXT: leal (,%rcx,8), %eax +; FALLBACK6-NEXT: movzbl (%rsi), %esi +; FALLBACK6-NEXT: leal (,%rsi,8), %eax ; FALLBACK6-NEXT: xorps %xmm2, %xmm2 ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: andb $24, %cl -; FALLBACK6-NEXT: negb %cl -; FALLBACK6-NEXT: movsbq %cl, %rcx -; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK6-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: movl %eax, %ecx +; FALLBACK6-NEXT: andb $24, %sil +; FALLBACK6-NEXT: negb %sil +; FALLBACK6-NEXT: movsbq %sil, %rsi +; FALLBACK6-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK6-NEXT: shrq %r8 +; FALLBACK6-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK6-NEXT: orq %rdi, %r8 +; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK6-NEXT: shrq %rsi +; FALLBACK6-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK6-NEXT: orq %r9, %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: shrq %rcx -; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r8, %rcx -; FALLBACK6-NEXT: shrq %r9 -; FALLBACK6-NEXT: shrxq %rax, %r9, %rax +; FALLBACK6-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, (%rdx) +; FALLBACK6-NEXT: movq %rcx, (%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, 24(%rdx) +; FALLBACK6-NEXT: movq %rsi, 16(%rdx) +; FALLBACK6-NEXT: movq %r8, 24(%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: shl_32bytes: @@ -6308,36 +6299,36 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-LABEL: shl_32bytes: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: leal (,%rcx,8), %eax +; FALLBACK10-NEXT: movzbl (%rsi), %esi +; FALLBACK10-NEXT: leal (,%rsi,8), %eax ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: andb $24, %cl -; FALLBACK10-NEXT: negb %cl -; FALLBACK10-NEXT: movsbq %cl, %rcx -; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK10-NEXT: movq 
-32(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK10-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: andb $24, %sil +; FALLBACK10-NEXT: negb %sil +; FALLBACK10-NEXT: movsbq %sil, %rsi +; FALLBACK10-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK10-NEXT: shrq %r8 +; FALLBACK10-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %rdi, %r8 +; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK10-NEXT: shrq %rsi +; FALLBACK10-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK10-NEXT: orq %r9, %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: shrq %rcx -; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r8, %rcx -; FALLBACK10-NEXT: shrq %r9 -; FALLBACK10-NEXT: shrxq %rax, %r9, %rax +; FALLBACK10-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, (%rdx) +; FALLBACK10-NEXT: movq %rcx, (%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, 24(%rdx) +; FALLBACK10-NEXT: movq %rsi, 16(%rdx) +; FALLBACK10-NEXT: movq %r8, 24(%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -6446,36 +6437,36 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-LABEL: shl_32bytes: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: leal (,%rcx,8), %eax +; FALLBACK14-NEXT: movzbl (%rsi), %esi +; FALLBACK14-NEXT: leal (,%rsi,8), %eax ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: andb $24, %cl -; FALLBACK14-NEXT: negb %cl -; FALLBACK14-NEXT: movsbq %cl, %rcx -; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK14-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movl %eax, %ecx +; FALLBACK14-NEXT: andb $24, %sil +; FALLBACK14-NEXT: negb %sil +; FALLBACK14-NEXT: movsbq %sil, %rsi +; FALLBACK14-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK14-NEXT: shrq %r8 +; FALLBACK14-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %rdi, %r8 +; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK14-NEXT: shrq %rsi +; FALLBACK14-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK14-NEXT: orq %r9, %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: shrq %rcx -; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r8, %rcx -; FALLBACK14-NEXT: shrq %r9 -; FALLBACK14-NEXT: shrxq %rax, %r9, %rax +; 
FALLBACK14-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, (%rdx) +; FALLBACK14-NEXT: movq %rcx, (%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, 24(%rdx) +; FALLBACK14-NEXT: movq %rsi, 16(%rdx) +; FALLBACK14-NEXT: movq %r8, 24(%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -6745,71 +6736,75 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %edx, %eax +; FALLBACK18-NEXT: movl %eax, %ebp ; FALLBACK18-NEXT: andb $28, %bl ; FALLBACK18-NEXT: negb %bl ; FALLBACK18-NEXT: movsbl %bl, %esi ; FALLBACK18-NEXT: movl 64(%esp,%esi), %ebx ; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 68(%esp,%esi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, %eax, %edi -; FALLBACK18-NEXT: movl %edx, %ecx -; FALLBACK18-NEXT: notb %cl +; FALLBACK18-NEXT: movl 68(%esp,%esi), %ecx +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %eax, %ecx, %edi +; FALLBACK18-NEXT: notb %dl ; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %ecx, %ebx, %ebx +; FALLBACK18-NEXT: shrxl %edx, %ebx, %ebx ; FALLBACK18-NEXT: orl %edi, %ebx ; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 72(%esp,%esi), %ebx ; FALLBACK18-NEXT: movl %ebx, %edi ; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %ecx, %edi, %eax +; FALLBACK18-NEXT: shrxl %edx, %edi, %eax ; FALLBACK18-NEXT: movl 76(%esp,%esi), %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %ebp +; FALLBACK18-NEXT: movl %ebp, %esi +; FALLBACK18-NEXT: shlxl %ebp, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ecx, %eax, %eax -; FALLBACK18-NEXT: orl %ebx, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 80(%esp,%esi), %ebx -; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %esi, %ebx, %ebx +; FALLBACK18-NEXT: shrl %ecx +; FALLBACK18-NEXT: shrxl %edx, %ecx, %ecx +; FALLBACK18-NEXT: orl %ebx, %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK18-NEXT: movl 80(%esp,%ebp), %ecx +; FALLBACK18-NEXT: movl %ecx, %ebx ; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %ecx, %ebx, %eax -; FALLBACK18-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebp +; FALLBACK18-NEXT: shrxl %edx, %ebx, %eax +; FALLBACK18-NEXT: movl 84(%esp,%ebp), %ebx +; FALLBACK18-NEXT: shlxl %esi, %ebx, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %esi, %ecx, %ecx +; FALLBACK18-NEXT: movl %esi, %eax ; FALLBACK18-NEXT: shrl %edi -; FALLBACK18-NEXT: shrxl %ecx, %edi, %edi -; FALLBACK18-NEXT: orl %eax, %edi -; FALLBACK18-NEXT: shlxl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, 92(%esp,%esi), %ebp -; FALLBACK18-NEXT: movl 88(%esp,%esi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %eax +; FALLBACK18-NEXT: shrxl %edx, %edi, %edi +; FALLBACK18-NEXT: orl %ecx, %edi +; FALLBACK18-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: shlxl %esi, 92(%esp,%ecx), %ebp +; FALLBACK18-NEXT: movl 88(%esp,%ecx), %esi +; FALLBACK18-NEXT: shlxl %eax, %esi, %ecx ; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK18-NEXT: shrxl %edx, %esi, %esi ; FALLBACK18-NEXT: orl %ebp, %esi ; FALLBACK18-NEXT: shrl %ebx -; FALLBACK18-NEXT: shrxl %ecx, %ebx, %edx -; FALLBACK18-NEXT: orl %eax, %edx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) -; FALLBACK18-NEXT: movl %edx, 24(%eax) -; FALLBACK18-NEXT: movl %esi, 28(%eax) -; FALLBACK18-NEXT: movl %edi, 16(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) +; FALLBACK18-NEXT: shrxl %edx, %ebx, %eax +; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: movl %ecx, (%edx) +; FALLBACK18-NEXT: movl %eax, 24(%edx) +; FALLBACK18-NEXT: movl %esi, 28(%edx) +; FALLBACK18-NEXT: movl %edi, 16(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 20(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edx) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -7085,78 +7080,76 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: shlb $3, %al +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %ecx +; FALLBACK22-NEXT: shlb $3, %cl ; FALLBACK22-NEXT: xorps %xmm2, %xmm2 ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $28, %cl -; FALLBACK22-NEXT: negb %cl -; FALLBACK22-NEXT: movsbl %cl, %edx -; FALLBACK22-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK22-NEXT: movl 80(%esp,%edx), %esi -; FALLBACK22-NEXT: shlxl %eax, %esi, %edi -; 
FALLBACK22-NEXT: movl %eax, %ebx -; FALLBACK22-NEXT: notb %bl -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 76(%esp,%edx), %ecx -; FALLBACK22-NEXT: movl %ecx, %esi -; FALLBACK22-NEXT: shrl %esi -; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %edi, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK22-NEXT: movl 72(%esp,%edx), %esi -; FALLBACK22-NEXT: movl %esi, %edi +; FALLBACK22-NEXT: movl %ecx, %ebx +; FALLBACK22-NEXT: andb $28, %dl +; FALLBACK22-NEXT: negb %dl +; FALLBACK22-NEXT: movsbl %dl, %edx +; FALLBACK22-NEXT: movl 84(%esp,%edx), %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %esi +; FALLBACK22-NEXT: notb %cl +; FALLBACK22-NEXT: movl 80(%esp,%edx), %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: shrl %edi -; FALLBACK22-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK22-NEXT: orl %ecx, %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %edi +; FALLBACK22-NEXT: orl %esi, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, %esi, %ecx -; FALLBACK22-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK22-NEXT: movl 76(%esp,%edx), %esi ; FALLBACK22-NEXT: movl %esi, %edi ; FALLBACK22-NEXT: shrl %edi -; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK22-NEXT: orl %ecx, %ebp -; FALLBACK22-NEXT: shlxl %eax, %esi, %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %edi +; FALLBACK22-NEXT: orl %ebp, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK22-NEXT: movl 72(%esp,%edx), %edi +; FALLBACK22-NEXT: movl %edi, %ebp +; FALLBACK22-NEXT: shrl %ebp +; FALLBACK22-NEXT: shrxl %ecx, %ebp, %ebp +; FALLBACK22-NEXT: orl %esi, %ebp +; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %ebx, %edi, %esi +; FALLBACK22-NEXT: movl 68(%esp,%edx), %ebp +; FALLBACK22-NEXT: movl %ebp, %edi +; FALLBACK22-NEXT: shrl %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %edi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp ; FALLBACK22-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK22-NEXT: movl %esi, %ecx -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: orl %edi, %ecx -; FALLBACK22-NEXT: shlxl %eax, %esi, %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shlxl %eax, 92(%esp,%edx), %edi -; FALLBACK22-NEXT: movl 88(%esp,%edx), %edx -; FALLBACK22-NEXT: shlxl %eax, %edx, %esi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK22-NEXT: orl %ebp, %edi ; FALLBACK22-NEXT: shrl %eax -; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: shrl %edx -; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK22-NEXT: orl %edi, %edx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK22-NEXT: movl %edi, (%esi) -; FALLBACK22-NEXT: movl %edx, 28(%esi) -; FALLBACK22-NEXT: movl %eax, 24(%esi) -; FALLBACK22-NEXT: movl %ecx, 4(%esi) -; FALLBACK22-NEXT: movl %ebp, 8(%esi) +; FALLBACK22-NEXT: shrxl %ecx, %eax, %esi +; FALLBACK22-NEXT: 
movl 88(%esp,%edx), %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %ebp +; FALLBACK22-NEXT: orl %ebp, %esi +; FALLBACK22-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK22-NEXT: shlxl %ebx, 92(%esp,%edx), %edx +; FALLBACK22-NEXT: shrl %eax +; FALLBACK22-NEXT: shrxl %ecx, %eax, %eax +; FALLBACK22-NEXT: orl %edx, %eax +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK22-NEXT: movl %ebp, (%ecx) +; FALLBACK22-NEXT: movl %eax, 28(%ecx) +; FALLBACK22-NEXT: movl %esi, 24(%ecx) +; FALLBACK22-NEXT: movl %edi, 4(%ecx) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK22-NEXT: movl %eax, 8(%ecx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 12(%esi) +; FALLBACK22-NEXT: movl %eax, 12(%ecx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 16(%esi) +; FALLBACK22-NEXT: movl %eax, 16(%ecx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 20(%esi) +; FALLBACK22-NEXT: movl %eax, 20(%ecx) ; FALLBACK22-NEXT: addl $108, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi @@ -7410,76 +7403,74 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %eax ; FALLBACK26-NEXT: shlb $3, %al ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $28, %cl -; FALLBACK26-NEXT: negb %cl -; FALLBACK26-NEXT: movsbl %cl, %edx -; FALLBACK26-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK26-NEXT: movl 80(%esp,%edx), %esi -; FALLBACK26-NEXT: shlxl %eax, %esi, %edi ; FALLBACK26-NEXT: movl %eax, %ebx -; FALLBACK26-NEXT: notb %bl -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 76(%esp,%edx), %ecx -; FALLBACK26-NEXT: movl %ecx, %esi -; FALLBACK26-NEXT: shrl %esi -; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %edi, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK26-NEXT: movl 72(%esp,%edx), %esi -; FALLBACK26-NEXT: movl %esi, %edi +; FALLBACK26-NEXT: andb $28, %dl +; FALLBACK26-NEXT: negb %dl +; FALLBACK26-NEXT: movsbl %dl, %edx +; FALLBACK26-NEXT: movl 84(%esp,%edx), %ecx +; FALLBACK26-NEXT: shlxl %ebx, %ecx, %esi +; FALLBACK26-NEXT: notb %al +; FALLBACK26-NEXT: movl 80(%esp,%edx), %edi +; FALLBACK26-NEXT: shlxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: shrl %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK26-NEXT: orl %ecx, %edi +; FALLBACK26-NEXT: shrxl %eax, %edi, %edi +; FALLBACK26-NEXT: orl %esi, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, %esi, %ecx -; FALLBACK26-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK26-NEXT: movl 76(%esp,%edx), %esi ; FALLBACK26-NEXT: movl %esi, %edi ; FALLBACK26-NEXT: shrl %edi -; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp 
-; FALLBACK26-NEXT: orl %ecx, %ebp -; FALLBACK26-NEXT: shlxl %eax, %esi, %edi +; FALLBACK26-NEXT: shrxl %eax, %edi, %edi +; FALLBACK26-NEXT: orl %ebp, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK26-NEXT: movl 72(%esp,%edx), %edi +; FALLBACK26-NEXT: movl %edi, %ebp +; FALLBACK26-NEXT: shrl %ebp +; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK26-NEXT: orl %esi, %ebp +; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %ebx, %edi, %esi +; FALLBACK26-NEXT: movl 68(%esp,%edx), %ebp +; FALLBACK26-NEXT: movl %ebp, %edi +; FALLBACK26-NEXT: shrl %edi +; FALLBACK26-NEXT: shrxl %eax, %edi, %edi +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp ; FALLBACK26-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK26-NEXT: movl %esi, %ecx -; FALLBACK26-NEXT: shrl %ecx -; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: orl %edi, %ecx -; FALLBACK26-NEXT: shlxl %eax, %esi, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shlxl %eax, 92(%esp,%edx), %edi -; FALLBACK26-NEXT: movl 88(%esp,%edx), %edx -; FALLBACK26-NEXT: shlxl %eax, %edx, %esi +; FALLBACK26-NEXT: shrl %esi +; FALLBACK26-NEXT: shrxl %eax, %esi, %edi +; FALLBACK26-NEXT: orl %ebp, %edi +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %eax, %ecx, %esi +; FALLBACK26-NEXT: movl 88(%esp,%edx), %ecx +; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ebp +; FALLBACK26-NEXT: orl %ebp, %esi +; FALLBACK26-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK26-NEXT: shlxl %ebx, 92(%esp,%edx), %edx +; FALLBACK26-NEXT: shrl %ecx +; FALLBACK26-NEXT: shrxl %eax, %ecx, %eax +; FALLBACK26-NEXT: orl %edx, %eax +; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK26-NEXT: movl %ebp, (%ecx) +; FALLBACK26-NEXT: movl %eax, 28(%ecx) +; FALLBACK26-NEXT: movl %esi, 24(%ecx) +; FALLBACK26-NEXT: movl %edi, 4(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: shrl %eax -; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK26-NEXT: orl %esi, %eax -; FALLBACK26-NEXT: shrl %edx -; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK26-NEXT: orl %edi, %edx -; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK26-NEXT: movl %edi, (%esi) -; FALLBACK26-NEXT: movl %edx, 28(%esi) -; FALLBACK26-NEXT: movl %eax, 24(%esi) -; FALLBACK26-NEXT: movl %ecx, 4(%esi) -; FALLBACK26-NEXT: movl %ebp, 8(%esi) +; FALLBACK26-NEXT: movl %eax, 8(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 12(%esi) +; FALLBACK26-NEXT: movl %eax, 12(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 16(%esi) +; FALLBACK26-NEXT: movl %eax, 16(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 20(%esi) +; FALLBACK26-NEXT: movl %eax, 20(%ecx) ; FALLBACK26-NEXT: addl $108, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi @@ -7732,76 +7723,74 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; 
FALLBACK30-NEXT: movl %ecx, %eax +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %eax ; FALLBACK30-NEXT: shlb $3, %al ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $28, %cl -; FALLBACK30-NEXT: negb %cl -; FALLBACK30-NEXT: movsbl %cl, %edx -; FALLBACK30-NEXT: movl 84(%esp,%edx), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi -; FALLBACK30-NEXT: shlxl %eax, %esi, %edi ; FALLBACK30-NEXT: movl %eax, %ebx -; FALLBACK30-NEXT: notb %bl -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 76(%esp,%edx), %ecx -; FALLBACK30-NEXT: movl %ecx, %esi -; FALLBACK30-NEXT: shrl %esi -; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %edi, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx -; FALLBACK30-NEXT: movl 72(%esp,%edx), %esi -; FALLBACK30-NEXT: movl %esi, %edi +; FALLBACK30-NEXT: andb $28, %dl +; FALLBACK30-NEXT: negb %dl +; FALLBACK30-NEXT: movsbl %dl, %edx +; FALLBACK30-NEXT: movl 84(%esp,%edx), %ecx +; FALLBACK30-NEXT: shlxl %ebx, %ecx, %esi +; FALLBACK30-NEXT: notb %al +; FALLBACK30-NEXT: movl 80(%esp,%edx), %edi +; FALLBACK30-NEXT: shlxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: shrl %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %edi -; FALLBACK30-NEXT: orl %ecx, %edi +; FALLBACK30-NEXT: shrxl %eax, %edi, %edi +; FALLBACK30-NEXT: orl %esi, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, %esi, %ecx -; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK30-NEXT: movl 76(%esp,%edx), %esi ; FALLBACK30-NEXT: movl %esi, %edi ; FALLBACK30-NEXT: shrl %edi -; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp -; FALLBACK30-NEXT: orl %ecx, %ebp -; FALLBACK30-NEXT: shlxl %eax, %esi, %edi -; FALLBACK30-NEXT: movl 64(%esp,%edx), %esi -; FALLBACK30-NEXT: movl %esi, %ecx -; FALLBACK30-NEXT: shrl %ecx -; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: orl %edi, %ecx -; FALLBACK30-NEXT: shlxl %eax, %esi, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shlxl %eax, 92(%esp,%edx), %edi -; FALLBACK30-NEXT: movl 88(%esp,%edx), %edx -; FALLBACK30-NEXT: shlxl %eax, %edx, %esi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: shrl %eax -; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK30-NEXT: orl %esi, %eax -; FALLBACK30-NEXT: shrl %edx -; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx -; FALLBACK30-NEXT: orl %edi, %edx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK30-NEXT: movl %edi, (%esi) -; FALLBACK30-NEXT: movl %edx, 28(%esi) -; FALLBACK30-NEXT: movl %eax, 24(%esi) -; FALLBACK30-NEXT: movl %ecx, 4(%esi) -; FALLBACK30-NEXT: movl %ebp, 8(%esi) +; FALLBACK30-NEXT: shrxl %eax, %edi, %edi +; FALLBACK30-NEXT: orl %ebp, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK30-NEXT: movl 72(%esp,%edx), %edi +; FALLBACK30-NEXT: movl %edi, %ebp +; FALLBACK30-NEXT: shrl %ebp +; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp +; FALLBACK30-NEXT: 
orl %esi, %ebp +; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %ebx, %edi, %esi +; FALLBACK30-NEXT: movl 68(%esp,%edx), %ebp +; FALLBACK30-NEXT: movl %ebp, %edi +; FALLBACK30-NEXT: shrl %edi +; FALLBACK30-NEXT: shrxl %eax, %edi, %edi +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK30-NEXT: movl 64(%esp,%edx), %esi +; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrl %esi +; FALLBACK30-NEXT: shrxl %eax, %esi, %edi +; FALLBACK30-NEXT: orl %ebp, %edi +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %eax, %ecx, %esi +; FALLBACK30-NEXT: movl 88(%esp,%edx), %ecx +; FALLBACK30-NEXT: shlxl %ebx, %ecx, %ebp +; FALLBACK30-NEXT: orl %ebp, %esi +; FALLBACK30-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK30-NEXT: shlxl %ebx, 92(%esp,%edx), %edx +; FALLBACK30-NEXT: shrl %ecx +; FALLBACK30-NEXT: shrxl %eax, %ecx, %eax +; FALLBACK30-NEXT: orl %edx, %eax +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK30-NEXT: movl %ebp, (%ecx) +; FALLBACK30-NEXT: movl %eax, 28(%ecx) +; FALLBACK30-NEXT: movl %esi, 24(%ecx) +; FALLBACK30-NEXT: movl %edi, 4(%ecx) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl %eax, 8(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 12(%esi) +; FALLBACK30-NEXT: movl %eax, 12(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 16(%esi) +; FALLBACK30-NEXT: movl %eax, 16(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%esi) +; FALLBACK30-NEXT: movl %eax, 20(%ecx) ; FALLBACK30-NEXT: addl $108, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi @@ -7987,32 +7976,32 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: shlb $2, %sil ; FALLBACK2-NEXT: andb $24, %sil ; FALLBACK2-NEXT: negb %sil -; FALLBACK2-NEXT: movsbq %sil, %rsi -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8 -; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10 -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movsbq %sil, %rdi +; FALLBACK2-NEXT: movq -40(%rsp,%rdi), %r8 +; FALLBACK2-NEXT: movq -32(%rsp,%rdi), %rsi +; FALLBACK2-NEXT: shlxq %rcx, %rsi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK2-NEXT: shrq %r8 +; FALLBACK2-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK2-NEXT: orq %r9, %r8 +; FALLBACK2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 +; FALLBACK2-NEXT: movq -24(%rsp,%rdi), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK2-NEXT: shrq %rdi ; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi ; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq 
%r10, %rax -; FALLBACK2-NEXT: movq %r11, (%rdx) +; FALLBACK2-NEXT: shrxq %rax, %rsi, %rax +; FALLBACK2-NEXT: orq %rcx, %rax +; FALLBACK2-NEXT: movq %r10, (%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_32bytes_dwordOff: @@ -8135,40 +8124,40 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK6: # %bb.0: ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 -; FALLBACK6-NEXT: movzbl (%rsi), %ecx -; FALLBACK6-NEXT: movl %ecx, %eax +; FALLBACK6-NEXT: movzbl (%rsi), %esi +; FALLBACK6-NEXT: movl %esi, %eax ; FALLBACK6-NEXT: shlb $5, %al ; FALLBACK6-NEXT: xorps %xmm2, %xmm2 ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: shlb $2, %cl -; FALLBACK6-NEXT: andb $24, %cl -; FALLBACK6-NEXT: negb %cl -; FALLBACK6-NEXT: movsbq %cl, %rcx -; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK6-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: movl %eax, %ecx +; FALLBACK6-NEXT: shlb $2, %sil +; FALLBACK6-NEXT: andb $24, %sil +; FALLBACK6-NEXT: negb %sil +; FALLBACK6-NEXT: movsbq %sil, %rsi +; FALLBACK6-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK6-NEXT: shrq %r8 +; FALLBACK6-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK6-NEXT: orq %rdi, %r8 +; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK6-NEXT: shrq %rsi +; FALLBACK6-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK6-NEXT: orq %r9, %rsi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: shrq %rcx -; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r8, %rcx -; FALLBACK6-NEXT: shrq %r9 -; FALLBACK6-NEXT: shrxq %rax, %r9, %rax +; FALLBACK6-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, (%rdx) +; FALLBACK6-NEXT: movq %rcx, (%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, 24(%rdx) +; FALLBACK6-NEXT: movq %rsi, 16(%rdx) +; FALLBACK6-NEXT: movq %r8, 24(%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: shl_32bytes_dwordOff: @@ -8283,38 +8272,38 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK10-LABEL: shl_32bytes_dwordOff: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK10-NEXT: movzbl (%rsi), %ecx -; FALLBACK10-NEXT: movl %ecx, %eax +; FALLBACK10-NEXT: movzbl (%rsi), %esi +; FALLBACK10-NEXT: movl %esi, %eax ; FALLBACK10-NEXT: shlb $5, %al ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: shlb $2, %cl -; FALLBACK10-NEXT: andb $24, %cl 
-; FALLBACK10-NEXT: negb %cl -; FALLBACK10-NEXT: movsbq %cl, %rcx -; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK10-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: shlb $2, %sil +; FALLBACK10-NEXT: andb $24, %sil +; FALLBACK10-NEXT: negb %sil +; FALLBACK10-NEXT: movsbq %sil, %rsi +; FALLBACK10-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK10-NEXT: shrq %r8 +; FALLBACK10-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %rdi, %r8 +; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK10-NEXT: shrq %rsi +; FALLBACK10-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK10-NEXT: orq %r9, %rsi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: shrq %rcx -; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r8, %rcx -; FALLBACK10-NEXT: shrq %r9 -; FALLBACK10-NEXT: shrxq %rax, %r9, %rax +; FALLBACK10-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, (%rdx) +; FALLBACK10-NEXT: movq %rcx, (%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, 24(%rdx) +; FALLBACK10-NEXT: movq %rsi, 16(%rdx) +; FALLBACK10-NEXT: movq %r8, 24(%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -8428,38 +8417,38 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou ; FALLBACK14-LABEL: shl_32bytes_dwordOff: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 -; FALLBACK14-NEXT: movzbl (%rsi), %ecx -; FALLBACK14-NEXT: movl %ecx, %eax +; FALLBACK14-NEXT: movzbl (%rsi), %esi +; FALLBACK14-NEXT: movl %esi, %eax ; FALLBACK14-NEXT: shlb $5, %al ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: shlb $2, %cl -; FALLBACK14-NEXT: andb $24, %cl -; FALLBACK14-NEXT: negb %cl -; FALLBACK14-NEXT: movsbq %cl, %rcx -; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8 -; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9 -; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10 -; FALLBACK14-NEXT: shlxq %rax, %r9, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movl %eax, %ecx +; FALLBACK14-NEXT: shlb $2, %sil +; FALLBACK14-NEXT: andb $24, %sil +; FALLBACK14-NEXT: negb %sil +; FALLBACK14-NEXT: movsbq %sil, %rsi +; FALLBACK14-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi ; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: shlxq %rcx, %r8, %r9 +; FALLBACK14-NEXT: shrq %r8 +; FALLBACK14-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %rdi, %r8 +; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %rdi +; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rsi, %r10 +; FALLBACK14-NEXT: shrq %rsi +; 
FALLBACK14-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK14-NEXT: orq %r9, %rsi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %rcx ; FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: shrq %rcx -; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r8, %rcx -; FALLBACK14-NEXT: shrq %r9 -; FALLBACK14-NEXT: shrxq %rax, %r9, %rax +; FALLBACK14-NEXT: shrxq %rax, %rdi, %rax ; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, (%rdx) +; FALLBACK14-NEXT: movq %rcx, (%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, 24(%rdx) +; FALLBACK14-NEXT: movq %rsi, 16(%rdx) +; FALLBACK14-NEXT: movq %r8, 24(%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -8906,30 +8895,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $24, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: ashr_32bytes: @@ -9067,30 +9056,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %ecx ; FALLBACK6-NEXT: andb $24, %sil -; FALLBACK6-NEXT: movzbl %sil, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; 
FALLBACK6-NEXT: movzbl %sil, %esi +; FALLBACK6-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rsi), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK6-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: ashr_32bytes: @@ -9227,30 +9216,30 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movl %eax, %ecx ; FALLBACK10-NEXT: andb $24, %sil -; FALLBACK10-NEXT: movzbl %sil, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movzbl %sil, %esi +; FALLBACK10-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi ; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, 24(%rdx) +; FALLBACK10-NEXT: movq -64(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rsi), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK10-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rax, %r9, %rax +; FALLBACK10-NEXT: orq %rdi, %rax +; FALLBACK10-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK10-NEXT: movq %rcx, 24(%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: retq ; ; FALLBACK11-LABEL: ashr_32bytes: @@ -9387,30 +9376,30 @@ define void 
@ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movl %eax, %ecx ; FALLBACK14-NEXT: andb $24, %sil -; FALLBACK14-NEXT: movzbl %sil, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movzbl %sil, %esi +; FALLBACK14-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi ; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) +; FALLBACK14-NEXT: movq -64(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rsi), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK14-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK14-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rax, %r9, %rax +; FALLBACK14-NEXT: orq %rdi, %rax +; FALLBACK14-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK14-NEXT: movq %rcx, 24(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, (%rdx) ; FALLBACK14-NEXT: retq ; ; FALLBACK15-LABEL: ashr_32bytes: @@ -9671,7 +9660,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: pushl %edi ; FALLBACK18-NEXT: pushl %esi ; FALLBACK18-NEXT: subl $108, %esp -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi ; FALLBACK18-NEXT: movl (%esi), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -9680,22 +9669,22 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl 8(%esi), %ebx ; FALLBACK18-NEXT: movl 12(%esi), %ebp ; FALLBACK18-NEXT: movl 16(%esi), %edi -; FALLBACK18-NEXT: movzbl (%ecx), %ecx -; FALLBACK18-NEXT: movl 20(%esi), %edx +; FALLBACK18-NEXT: movzbl (%edx), %edx +; FALLBACK18-NEXT: movl 20(%esi), %ecx ; FALLBACK18-NEXT: movl 24(%esi), %eax ; FALLBACK18-NEXT: movl 28(%esi), %esi ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ecx, %eax -; FALLBACK18-NEXT: shlb $3, %al +; FALLBACK18-NEXT: movl %edx, %ecx +; FALLBACK18-NEXT: shlb $3, %cl ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload 
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: sarl $31, %esi ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) @@ -9705,66 +9694,65 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: andb $28, %cl -; FALLBACK18-NEXT: movzbl %cl, %edi -; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK18-NEXT: shrxl %eax, %esi, %ebx -; FALLBACK18-NEXT: movl %eax, %edx -; FALLBACK18-NEXT: notb %dl -; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %ebp -; FALLBACK18-NEXT: orl %ebx, %ebp -; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%edi), %ebx -; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %esi -; FALLBACK18-NEXT: orl %ebx, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%esp,%edi), %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal (%esi,%esi), %ebx -; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi -; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp -; FALLBACK18-NEXT: shrxl %eax, %ebp, %ebx -; FALLBACK18-NEXT: orl %ebx, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %eax, %ecx, %ecx -; FALLBACK18-NEXT: movl %eax, %ebx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax -; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: movl %ecx, %eax +; FALLBACK18-NEXT: andb $28, %dl +; FALLBACK18-NEXT: movzbl %dl, %esi +; FALLBACK18-NEXT: movl 36(%esp,%esi), %edx +; FALLBACK18-NEXT: movl 40(%esp,%esi), %ebp +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: notb %cl +; FALLBACK18-NEXT: leal (%ebp,%ebp), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ebx +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%esi), %edi +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: orl %edi, %edx +; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 48(%esp,%esi), %edx +; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %ebx +; FALLBACK18-NEXT: movl 44(%esp,%esi), %edx +; FALLBACK18-NEXT: shrxl %eax, %edx, %edi +; FALLBACK18-NEXT: orl %edi, %ebx +; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrxl %eax, %ebp, %edi +; FALLBACK18-NEXT: movl %eax, %ebp +; FALLBACK18-NEXT: addl %edx, %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %eax +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx -; 
FALLBACK18-NEXT: movl 52(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi -; FALLBACK18-NEXT: orl %esi, %ecx -; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 56(%esp,%esi), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %edx +; FALLBACK18-NEXT: shlxl %ecx, %edx, %edx +; FALLBACK18-NEXT: movl 52(%esp,%esi), %eax +; FALLBACK18-NEXT: shrxl %ebp, %eax, %ebx +; FALLBACK18-NEXT: orl %ebx, %edx +; FALLBACK18-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %esi -; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax -; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK18-NEXT: sarxl %ebx, %edi, %ebx -; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %edx, %edi, %edx -; FALLBACK18-NEXT: orl %eax, %edx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %ebx, 28(%eax) -; FALLBACK18-NEXT: movl %edx, 24(%eax) -; FALLBACK18-NEXT: movl %esi, 16(%eax) -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, (%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: shlxl %ecx, %eax, %eax +; FALLBACK18-NEXT: orl %ebx, %eax +; FALLBACK18-NEXT: movl 60(%esp,%esi), %esi +; FALLBACK18-NEXT: leal (%esi,%esi), %ebx +; FALLBACK18-NEXT: shlxl %ecx, %ebx, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK18-NEXT: orl %edi, %ecx +; FALLBACK18-NEXT: sarxl %ebp, %esi, %esi +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edi +; FALLBACK18-NEXT: movl %esi, 28(%edi) +; FALLBACK18-NEXT: movl %ecx, 24(%edi) +; FALLBACK18-NEXT: movl %eax, 16(%edi) +; FALLBACK18-NEXT: movl %edx, 20(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, (%edi) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edi) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -10070,82 +10058,82 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movl 16(%ecx), %esi ; FALLBACK22-NEXT: movl 20(%ecx), %edi -; FALLBACK22-NEXT: movl 24(%ecx), %ebx -; FALLBACK22-NEXT: movl 28(%ecx), %edx -; FALLBACK22-NEXT: movzbl (%eax), %ecx -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: shlb $3, %al -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl 24(%ecx), %ebp +; FALLBACK22-NEXT: movl 28(%ecx), %ecx +; FALLBACK22-NEXT: movzbl (%eax), %edx +; FALLBACK22-NEXT: movl %edx, %ebx +; FALLBACK22-NEXT: shlb $3, %bl +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; 
FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: sarl $31, %edx -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: andb $28, %cl -; FALLBACK22-NEXT: movzbl %cl, %edi -; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %eax, %edx -; FALLBACK22-NEXT: notb %dl -; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %edx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK22-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK22-NEXT: movl %eax, %ecx -; FALLBACK22-NEXT: orl %ebx, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK22-NEXT: leal (%esi,%esi), %ebx -; FALLBACK22-NEXT: shlxl %edx, %ebx, %eax -; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK22-NEXT: shrxl %ecx, %ebx, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax +; FALLBACK22-NEXT: sarl $31, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK22-NEXT: movl %ebx, %eax +; FALLBACK22-NEXT: andb $28, %dl +; FALLBACK22-NEXT: movzbl %dl, %ecx +; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%ecx), %edx +; FALLBACK22-NEXT: movl %eax, %ebp +; FALLBACK22-NEXT: notb %bl +; FALLBACK22-NEXT: movl 36(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK22-NEXT: addl %ebx, %ebx -; FALLBACK22-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK22-NEXT: orl %ebp, %ebx -; FALLBACK22-NEXT: shrxl %ecx, %esi, %ecx -; FALLBACK22-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK22-NEXT: sarxl %eax, %edi, %eax -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %edx, %edi, %edi -; FALLBACK22-NEXT: orl %ecx, %edi -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK22-NEXT: addl %ecx, %ecx -; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx -; 
FALLBACK22-NEXT: orl %esi, %ecx -; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK22-NEXT: movl %eax, 28(%edx) -; FALLBACK22-NEXT: movl %ecx, 4(%edx) -; FALLBACK22-NEXT: movl %edi, 24(%edx) -; FALLBACK22-NEXT: movl %ebx, 16(%edx) +; FALLBACK22-NEXT: leal (%eax,%eax), %esi +; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK22-NEXT: orl %edx, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 48(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: leal (%eax,%eax), %edx +; FALLBACK22-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK22-NEXT: movl 44(%esp,%ecx), %edx +; FALLBACK22-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %edx, %edx +; FALLBACK22-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK22-NEXT: movl 40(%esp,%ecx), %edx +; FALLBACK22-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK22-NEXT: movl %ebp, %edx +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 56(%esp,%ecx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %ebp +; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK22-NEXT: movl 52(%esp,%ecx), %eax +; FALLBACK22-NEXT: shrxl %edx, %eax, %edi +; FALLBACK22-NEXT: orl %edi, %ebp +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %edx, %esi, %eax +; FALLBACK22-NEXT: movl 60(%esp,%ecx), %ecx +; FALLBACK22-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK22-NEXT: orl %eax, %esi ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK22-NEXT: movl %eax, 20(%edx) +; FALLBACK22-NEXT: addl %eax, %eax +; FALLBACK22-NEXT: shlxl %ebx, %eax, %eax +; FALLBACK22-NEXT: movl %edx, %ebx +; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK22-NEXT: orl %edx, %eax +; FALLBACK22-NEXT: sarxl %ebx, %ecx, %ecx +; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: movl %ecx, 28(%edx) +; FALLBACK22-NEXT: movl %eax, 4(%edx) +; FALLBACK22-NEXT: movl %esi, 24(%edx) +; FALLBACK22-NEXT: movl %edi, 16(%edx) +; FALLBACK22-NEXT: movl %ebp, 20(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, 8(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -10446,82 +10434,82 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK26-NEXT: movl 16(%ecx), %esi ; FALLBACK26-NEXT: movl 20(%ecx), %edi -; FALLBACK26-NEXT: movl 24(%ecx), %ebx -; FALLBACK26-NEXT: movl 28(%ecx), %edx -; FALLBACK26-NEXT: movzbl (%eax), %ecx -; FALLBACK26-NEXT: movl %ecx, %eax -; FALLBACK26-NEXT: shlb $3, %al -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl 24(%ecx), %ebp +; FALLBACK26-NEXT: movl 28(%ecx), %ecx +; FALLBACK26-NEXT: movzbl (%eax), %edx +; FALLBACK26-NEXT: movl %edx, %ebx +; FALLBACK26-NEXT: shlb $3, %bl +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; 
FALLBACK26-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: sarl $31, %edx -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK26-NEXT: andb $28, %cl -; FALLBACK26-NEXT: movzbl %cl, %edi -; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %eax, %edx -; FALLBACK26-NEXT: notb %dl -; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %edx, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK26-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK26-NEXT: movl %eax, %ecx -; FALLBACK26-NEXT: orl %ebx, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK26-NEXT: leal (%esi,%esi), %ebx -; FALLBACK26-NEXT: shlxl %edx, %ebx, %eax -; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK26-NEXT: shrxl %ecx, %ebx, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: sarl $31, %ecx +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK26-NEXT: movl %ebx, %eax +; FALLBACK26-NEXT: andb $28, %dl +; FALLBACK26-NEXT: movzbl %dl, %ecx +; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%ecx), %edx +; FALLBACK26-NEXT: movl %eax, %ebp +; FALLBACK26-NEXT: notb %bl +; FALLBACK26-NEXT: movl 36(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl %ecx, %eax -; FALLBACK26-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK26-NEXT: addl %ebx, %ebx -; FALLBACK26-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK26-NEXT: orl %ebp, %ebx -; FALLBACK26-NEXT: shrxl %ecx, %esi, %ecx -; FALLBACK26-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK26-NEXT: sarxl %eax, %edi, %eax -; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %edx, %edi, %edi -; FALLBACK26-NEXT: orl %ecx, %edi -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK26-NEXT: addl %ecx, %ecx 
-; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK26-NEXT: orl %esi, %ecx +; FALLBACK26-NEXT: leal (%eax,%eax), %esi +; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: orl %edx, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 48(%esp,%ecx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: leal (%eax,%eax), %edx +; FALLBACK26-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK26-NEXT: movl 44(%esp,%ecx), %edx +; FALLBACK26-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %edx, %edx +; FALLBACK26-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK26-NEXT: movl 40(%esp,%ecx), %edx +; FALLBACK26-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK26-NEXT: movl %ebp, %edx +; FALLBACK26-NEXT: orl %esi, %edi +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 56(%esp,%ecx), %esi +; FALLBACK26-NEXT: leal (%esi,%esi), %ebp +; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK26-NEXT: movl 52(%esp,%ecx), %eax +; FALLBACK26-NEXT: shrxl %edx, %eax, %edi +; FALLBACK26-NEXT: orl %edi, %ebp +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %edx, %esi, %eax +; FALLBACK26-NEXT: movl 60(%esp,%ecx), %ecx +; FALLBACK26-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK26-NEXT: orl %eax, %esi +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %ebx, %eax, %eax +; FALLBACK26-NEXT: movl %edx, %ebx +; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %edx, %eax +; FALLBACK26-NEXT: sarxl %ebx, %ecx, %ecx ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK26-NEXT: movl %eax, 28(%edx) -; FALLBACK26-NEXT: movl %ecx, 4(%edx) -; FALLBACK26-NEXT: movl %edi, 24(%edx) -; FALLBACK26-NEXT: movl %ebx, 16(%edx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 20(%edx) +; FALLBACK26-NEXT: movl %ecx, 28(%edx) +; FALLBACK26-NEXT: movl %eax, 4(%edx) +; FALLBACK26-NEXT: movl %esi, 24(%edx) +; FALLBACK26-NEXT: movl %edi, 16(%edx) +; FALLBACK26-NEXT: movl %ebp, 20(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 8(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -10822,82 +10810,82 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK30-NEXT: movl 16(%ecx), %esi ; FALLBACK30-NEXT: movl 20(%ecx), %edi -; FALLBACK30-NEXT: movl 24(%ecx), %ebx -; FALLBACK30-NEXT: movl 28(%ecx), %edx -; FALLBACK30-NEXT: movzbl (%eax), %ecx -; FALLBACK30-NEXT: movl %ecx, %eax -; FALLBACK30-NEXT: shlb $3, %al -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl 24(%ecx), %ebp +; FALLBACK30-NEXT: movl 28(%ecx), %ecx +; FALLBACK30-NEXT: movzbl (%eax), %edx +; FALLBACK30-NEXT: movl %edx, %ebx +; 
FALLBACK30-NEXT: shlb $3, %bl +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: sarl $31, %edx -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: andb $28, %cl -; FALLBACK30-NEXT: movzbl %cl, %edi -; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %eax, %edx -; FALLBACK30-NEXT: notb %dl -; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %edx, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx -; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi -; FALLBACK30-NEXT: movl 40(%esp,%edi), %ecx -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx -; FALLBACK30-NEXT: movl %eax, %ecx -; FALLBACK30-NEXT: orl %ebx, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi -; FALLBACK30-NEXT: leal (%esi,%esi), %ebx -; FALLBACK30-NEXT: shlxl %edx, %ebx, %eax -; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx -; FALLBACK30-NEXT: shrxl %ecx, %ebx, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: sarl $31, %ecx +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK30-NEXT: movl %ebx, %eax +; FALLBACK30-NEXT: andb $28, %dl +; FALLBACK30-NEXT: movzbl %dl, %ecx +; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%ecx), %edx +; FALLBACK30-NEXT: movl %eax, %ebp +; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: movl 36(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl %ecx, %eax -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; FALLBACK30-NEXT: addl %ebx, %ebx -; FALLBACK30-NEXT: shlxl %edx, %ebx, %ebx -; FALLBACK30-NEXT: orl %ebp, %ebx -; FALLBACK30-NEXT: shrxl %ecx, %esi, %ecx -; FALLBACK30-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi -; FALLBACK30-NEXT: sarxl %eax, %edi, %eax -; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %edx, %edi, %edi -; FALLBACK30-NEXT: orl %ecx, %edi -; FALLBACK30-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK30-NEXT: addl %ecx, %ecx -; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx -; FALLBACK30-NEXT: orl %esi, %ecx -; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx -; FALLBACK30-NEXT: movl %eax, 28(%edx) -; FALLBACK30-NEXT: movl %ecx, 4(%edx) -; FALLBACK30-NEXT: movl %edi, 24(%edx) -; FALLBACK30-NEXT: movl %ebx, 16(%edx) +; FALLBACK30-NEXT: leal (%eax,%eax), %esi +; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK30-NEXT: orl %edx, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 48(%esp,%ecx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: leal (%eax,%eax), %edx +; FALLBACK30-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK30-NEXT: movl 44(%esp,%ecx), %edx +; FALLBACK30-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %edx, %edx +; FALLBACK30-NEXT: shlxl %ebx, %edx, %edi +; FALLBACK30-NEXT: movl 40(%esp,%ecx), %edx +; FALLBACK30-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %ebp, %edx, %esi +; FALLBACK30-NEXT: movl %ebp, %edx +; FALLBACK30-NEXT: orl %esi, %edi +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 56(%esp,%ecx), %esi +; FALLBACK30-NEXT: leal (%esi,%esi), %ebp +; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK30-NEXT: movl 52(%esp,%ecx), %eax +; FALLBACK30-NEXT: shrxl %edx, %eax, %edi +; FALLBACK30-NEXT: orl %edi, %ebp +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi +; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %edx, %esi, %eax +; FALLBACK30-NEXT: movl 60(%esp,%ecx), %ecx +; FALLBACK30-NEXT: leal (%ecx,%ecx), %esi +; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK30-NEXT: orl %eax, %esi ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 20(%edx) +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %ebx, %eax, %eax +; FALLBACK30-NEXT: movl %edx, %ebx +; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %edx, %eax +; FALLBACK30-NEXT: sarxl %ebx, %ecx, %ecx +; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK30-NEXT: movl %ecx, 28(%edx) +; FALLBACK30-NEXT: movl %eax, 4(%edx) +; FALLBACK30-NEXT: movl %esi, 24(%edx) +; FALLBACK30-NEXT: movl %edi, 16(%edx) +; FALLBACK30-NEXT: movl %ebp, 20(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 8(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -11104,30 +11092,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andb $6, %sil -; FALLBACK2-NEXT: movzbl %sil, %ecx -; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi -; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi -; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 -; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9 -; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), 
%rcx -; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK2-NEXT: movzbl %sil, %esi +; FALLBACK2-NEXT: movq -64(%rsp,%rsi,4), %rdi +; FALLBACK2-NEXT: movq -56(%rsp,%rsi,4), %r8 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK2-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 +; FALLBACK2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK2-NEXT: orq %r8, %rdi -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi -; FALLBACK2-NEXT: orq %r9, %rsi -; FALLBACK2-NEXT: addq %rcx, %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax -; FALLBACK2-NEXT: orq %r10, %rax -; FALLBACK2-NEXT: movq %r11, 24(%rdx) +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK2-NEXT: leaq (%rsi,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %rax +; FALLBACK2-NEXT: orq %r8, %rax +; FALLBACK2-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK2-NEXT: movq %rcx, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) -; FALLBACK2-NEXT: movq %rsi, (%rdx) -; FALLBACK2-NEXT: movq %rdi, 8(%rdx) +; FALLBACK2-NEXT: movq %rdi, (%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: ashr_32bytes_dwordOff: @@ -11268,30 +11256,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK6-NEXT: movl %eax, %ecx ; FALLBACK6-NEXT: andb $6, %sil -; FALLBACK6-NEXT: movzbl %sil, %ecx -; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK6-NEXT: movzbl %sil, %esi +; FALLBACK6-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi ; FALLBACK6-NEXT: notb %al -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK6-NEXT: orq %rsi, %rdi -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK6-NEXT: orq %r9, %rcx -; FALLBACK6-NEXT: addq %r8, %r8 -; FALLBACK6-NEXT: shlxq %rax, %r8, %rax -; FALLBACK6-NEXT: orq %r10, %rax -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq -64(%rsp,%rsi,4), %r8 +; FALLBACK6-NEXT: movq -56(%rsp,%rsi,4), %r9 +; FALLBACK6-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK6-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %rdi, %r10 +; FALLBACK6-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK6-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK6-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK6-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %rdi, %r11 +; FALLBACK6-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rax, %r9, %rax +; FALLBACK6-NEXT: orq %rdi, %rax +; FALLBACK6-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK6-NEXT: movq %rcx, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) -; FALLBACK6-NEXT: movq %rcx, 16(%rdx) -; FALLBACK6-NEXT: movq %rdi, (%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: ashr_32bytes_dwordOff: @@ -11431,30 
+11419,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK10-NEXT: movl %eax, %ecx ; FALLBACK10-NEXT: andb $6, %sil -; FALLBACK10-NEXT: movzbl %sil, %ecx -; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK10-NEXT: movzbl %sil, %esi +; FALLBACK10-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi ; FALLBACK10-NEXT: notb %al -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK10-NEXT: orq %rsi, %rdi -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK10-NEXT: orq %r9, %rcx -; FALLBACK10-NEXT: addq %r8, %r8 -; FALLBACK10-NEXT: shlxq %rax, %r8, %rax -; FALLBACK10-NEXT: orq %r10, %rax -; FALLBACK10-NEXT: movq %r11, 24(%rdx) +; FALLBACK10-NEXT: movq -64(%rsp,%rsi,4), %r8 +; FALLBACK10-NEXT: movq -56(%rsp,%rsi,4), %r9 +; FALLBACK10-NEXT: leaq (%r8,%r8), %r10 +; FALLBACK10-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK10-NEXT: orq %rdi, %r10 +; FALLBACK10-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK10-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK10-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK10-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %rdi, %r11 +; FALLBACK10-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rax, %r9, %rax +; FALLBACK10-NEXT: orq %rdi, %rax +; FALLBACK10-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK10-NEXT: movq %rcx, 24(%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) -; FALLBACK10-NEXT: movq %rcx, 16(%rdx) -; FALLBACK10-NEXT: movq %rdi, (%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, (%rdx) ; FALLBACK10-NEXT: retq ; ; FALLBACK11-LABEL: ashr_32bytes_dwordOff: @@ -11594,30 +11582,30 @@ define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; FALLBACK14-NEXT: movl %eax, %ecx ; FALLBACK14-NEXT: andb $6, %sil -; FALLBACK14-NEXT: movzbl %sil, %ecx -; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi -; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi -; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8 -; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 -; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx -; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 -; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11 -; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax +; FALLBACK14-NEXT: movzbl %sil, %esi +; FALLBACK14-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi ; FALLBACK14-NEXT: notb %al -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi -; FALLBACK14-NEXT: orq %rsi, %rdi -; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx -; FALLBACK14-NEXT: orq %r9, %rcx -; FALLBACK14-NEXT: addq %r8, %r8 -; FALLBACK14-NEXT: shlxq %rax, %r8, %rax -; FALLBACK14-NEXT: orq %r10, %rax -; FALLBACK14-NEXT: movq %r11, 24(%rdx) +; FALLBACK14-NEXT: movq -64(%rsp,%rsi,4), %r8 +; FALLBACK14-NEXT: movq -56(%rsp,%rsi,4), %r9 +; FALLBACK14-NEXT: leaq (%r8,%r8), %r10 +; 
FALLBACK14-NEXT: shlxq %rax, %r10, %r10 +; FALLBACK14-NEXT: orq %rdi, %r10 +; FALLBACK14-NEXT: shrxq %rcx, %r9, %rdi +; FALLBACK14-NEXT: movq -48(%rsp,%rsi,4), %rsi +; FALLBACK14-NEXT: leaq (%rsi,%rsi), %r11 +; FALLBACK14-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %rdi, %r11 +; FALLBACK14-NEXT: shrxq %rcx, %r8, %rdi +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rax, %r9, %rax +; FALLBACK14-NEXT: orq %rdi, %rax +; FALLBACK14-NEXT: sarxq %rcx, %rsi, %rcx +; FALLBACK14-NEXT: movq %rcx, 24(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rcx, 16(%rdx) -; FALLBACK14-NEXT: movq %rdi, (%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, (%rdx) ; FALLBACK14-NEXT: retq ; ; FALLBACK15-LABEL: ashr_32bytes_dwordOff: @@ -12204,10 +12192,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK2-LABEL: lshr_64bytes: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: pushq %rbp ; FALLBACK2-NEXT: pushq %r15 ; FALLBACK2-NEXT: pushq %r14 -; FALLBACK2-NEXT: pushq %r13 ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx ; FALLBACK2-NEXT: pushq %rax @@ -12235,60 +12221,58 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rax,8), %ecx ; FALLBACK2-NEXT: andl $56, %ecx +; FALLBACK2-NEXT: movl %ecx, %esi ; FALLBACK2-NEXT: andl $56, %eax -; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi -; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9 -; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx -; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 -; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi -; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8 -; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11 -; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14 -; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15 -; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp -; FALLBACK2-NEXT: movl %ecx, %r12d -; FALLBACK2-NEXT: notb %r12b -; FALLBACK2-NEXT: addq %r9, %r9 -; FALLBACK2-NEXT: shlxq %r12, %r9, %r9 +; FALLBACK2-NEXT: movq -120(%rsp,%rax), %r8 +; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r8, %r9 +; FALLBACK2-NEXT: notb %cl +; FALLBACK2-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK2-NEXT: orq %r9, %rdi +; FALLBACK2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r9 +; FALLBACK2-NEXT: addq %r8, %r8 +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK2-NEXT: orq %r9, %r8 +; FALLBACK2-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK2-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK2-NEXT: leaq (%r14,%r14), %r9 +; FALLBACK2-NEXT: shlxq %rcx, %r9, %r9 ; FALLBACK2-NEXT: orq %rbx, %r9 -; FALLBACK2-NEXT: addq %rdi, %rdi -; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi -; FALLBACK2-NEXT: orq %r13, %rdi -; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx -; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13 -; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK2-NEXT: shrxq %rcx, %rax, %rcx +; FALLBACK2-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK2-NEXT: addq %r11, %r11 +; FALLBACK2-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK2-NEXT: orq %r10, %r11 +; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r10, %rbx +; FALLBACK2-NEXT: movq -80(%rsp,%rax), %r15 +; FALLBACK2-NEXT: leaq (%r15,%r15), %r12 +; FALLBACK2-NEXT: shlxq %rcx, %r12, %r12 +; FALLBACK2-NEXT: orq %rbx, %r12 +; FALLBACK2-NEXT: shrxq %rsi, %r14, %rbx ; FALLBACK2-NEXT: addq %r10, %r10 -; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 
-; FALLBACK2-NEXT: orq %r8, %r10 -; FALLBACK2-NEXT: addq %rsi, %rsi -; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8 -; FALLBACK2-NEXT: shlxq %r12, %r8, %r8 -; FALLBACK2-NEXT: orq %r15, %r8 -; FALLBACK2-NEXT: addq %r14, %r14 -; FALLBACK2-NEXT: shlxq %r12, %r14, %r11 -; FALLBACK2-NEXT: orq %rbp, %r11 -; FALLBACK2-NEXT: addq %rax, %rax -; FALLBACK2-NEXT: shlxq %r12, %rax, %rax -; FALLBACK2-NEXT: orq %r13, %rax -; FALLBACK2-NEXT: movq %rcx, 56(%rdx) -; FALLBACK2-NEXT: movq %rax, 48(%rdx) -; FALLBACK2-NEXT: movq %r11, 32(%rdx) -; FALLBACK2-NEXT: movq %r8, 40(%rdx) -; FALLBACK2-NEXT: movq %rsi, 16(%rdx) -; FALLBACK2-NEXT: movq %r10, 24(%rdx) -; FALLBACK2-NEXT: movq %rdi, (%rdx) -; FALLBACK2-NEXT: movq %r9, 8(%rdx) +; FALLBACK2-NEXT: shlxq %rcx, %r10, %r10 +; FALLBACK2-NEXT: orq %rbx, %r10 +; FALLBACK2-NEXT: shrxq %rsi, %r15, %rbx +; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax +; FALLBACK2-NEXT: leaq (%rax,%rax), %r14 +; FALLBACK2-NEXT: shlxq %rcx, %r14, %rcx +; FALLBACK2-NEXT: orq %rbx, %rcx +; FALLBACK2-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK2-NEXT: movq %rax, 56(%rdx) +; FALLBACK2-NEXT: movq %rcx, 48(%rdx) +; FALLBACK2-NEXT: movq %r10, 32(%rdx) +; FALLBACK2-NEXT: movq %r12, 40(%rdx) +; FALLBACK2-NEXT: movq %r11, 16(%rdx) +; FALLBACK2-NEXT: movq %r9, 24(%rdx) +; FALLBACK2-NEXT: movq %r8, (%rdx) +; FALLBACK2-NEXT: movq %rdi, 8(%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r13 ; FALLBACK2-NEXT: popq %r14 ; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: popq %rbp ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_64bytes: @@ -12512,13 +12496,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK6-LABEL: lshr_64bytes: ; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %rbp ; FALLBACK6-NEXT: pushq %r15 ; FALLBACK6-NEXT: pushq %r14 ; FALLBACK6-NEXT: pushq %r13 ; FALLBACK6-NEXT: pushq %r12 ; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: pushq %rax ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 @@ -12533,62 +12515,60 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rax,8), %esi -; FALLBACK6-NEXT: andl $56, %esi +; FALLBACK6-NEXT: leal (,%rax,8), %ecx +; FALLBACK6-NEXT: andl $56, %ecx +; FALLBACK6-NEXT: movl %ecx, %esi ; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK6-NEXT: movl %esi, %ebx -; FALLBACK6-NEXT: notb %bl -; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK6-NEXT: orq %r11, %r8 -; FALLBACK6-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK6-NEXT: orq %r12, %r11 +; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8 +; FALLBACK6-NEXT: notb %cl +; FALLBACK6-NEXT: movq -120(%rsp,%rax), %r10 +; FALLBACK6-NEXT: movq -112(%rsp,%rax), %r9 +; 
FALLBACK6-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %rdi +; FALLBACK6-NEXT: orq %r8, %rdi +; FALLBACK6-NEXT: movq -104(%rsp,%rax), %r11 +; FALLBACK6-NEXT: shrxq %rsi, %r11, %rbx +; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r14 +; FALLBACK6-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r8 +; FALLBACK6-NEXT: orq %rbx, %r8 +; FALLBACK6-NEXT: shrxq %rsi, %r9, %rbx +; FALLBACK6-NEXT: addq %r11, %r11 +; FALLBACK6-NEXT: shlxq %rcx, %r11, %r11 +; FALLBACK6-NEXT: orq %rbx, %r11 +; FALLBACK6-NEXT: movq -88(%rsp,%rax), %rbx +; FALLBACK6-NEXT: shrxq %rsi, %rbx, %r15 ; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp +; FALLBACK6-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK6-NEXT: shlxq %rcx, %r13, %r13 +; FALLBACK6-NEXT: orq %r15, %r13 +; FALLBACK6-NEXT: shrxq %rsi, %r14, %r14 +; FALLBACK6-NEXT: addq %rbx, %rbx +; FALLBACK6-NEXT: shlxq %rcx, %rbx, %rbx +; FALLBACK6-NEXT: orq %r14, %rbx +; FALLBACK6-NEXT: shrxq %rsi, %r12, %r14 ; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK6-NEXT: shrxq %rsi, %rax, %rsi -; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK6-NEXT: orq %r9, %rdi -; FALLBACK6-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK6-NEXT: orq %r14, %r9 -; FALLBACK6-NEXT: addq %r10, %r10 -; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK6-NEXT: orq %r15, %r10 -; FALLBACK6-NEXT: addq %rax, %rax -; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK6-NEXT: orq %r13, %rax -; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK6-NEXT: orq %rbp, %rcx -; FALLBACK6-NEXT: movq %rsi, 56(%rdx) +; FALLBACK6-NEXT: leaq (%rax,%rax), %r15 +; FALLBACK6-NEXT: shlxq %rcx, %r15, %r15 +; FALLBACK6-NEXT: orq %r14, %r15 +; FALLBACK6-NEXT: shrxq %rsi, %r10, %r10 +; FALLBACK6-NEXT: addq %r9, %r9 +; FALLBACK6-NEXT: shlxq %rcx, %r9, %rcx +; FALLBACK6-NEXT: orq %r10, %rcx +; FALLBACK6-NEXT: shrxq %rsi, %rax, %rax +; FALLBACK6-NEXT: movq %rax, 56(%rdx) ; FALLBACK6-NEXT: movq %rcx, 8(%rdx) -; FALLBACK6-NEXT: movq %rax, 48(%rdx) -; FALLBACK6-NEXT: movq %r10, 32(%rdx) -; FALLBACK6-NEXT: movq %r9, 40(%rdx) -; FALLBACK6-NEXT: movq %rdi, 16(%rdx) -; FALLBACK6-NEXT: movq %r11, 24(%rdx) -; FALLBACK6-NEXT: movq %r8, (%rdx) -; FALLBACK6-NEXT: addq $8, %rsp +; FALLBACK6-NEXT: movq %r15, 48(%rdx) +; FALLBACK6-NEXT: movq %rbx, 32(%rdx) +; FALLBACK6-NEXT: movq %r13, 40(%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r8, 24(%rdx) +; FALLBACK6-NEXT: movq %rdi, (%rdx) ; FALLBACK6-NEXT: popq %rbx ; FALLBACK6-NEXT: popq %r12 ; FALLBACK6-NEXT: popq %r13 ; FALLBACK6-NEXT: popq %r14 ; FALLBACK6-NEXT: popq %r15 -; FALLBACK6-NEXT: popq %rbp ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: lshr_64bytes: @@ -12749,43 +12729,43 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK9-NEXT: pushq %rbx ; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK9-NEXT: movl (%rsi), %eax +; FALLBACK9-NEXT: movl (%rsi), %edi ; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: leal (,%rax,8), %ecx +; FALLBACK9-NEXT: leal (,%rdi,8), %ecx ; FALLBACK9-NEXT: andl $56, %ecx -; FALLBACK9-NEXT: andl $56, %eax -; FALLBACK9-NEXT: movq 
-96(%rsp,%rax), %rdi -; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq %r9, %rsi -; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK9-NEXT: andl $56, %edi +; FALLBACK9-NEXT: movq -96(%rsp,%rdi), %rsi +; FALLBACK9-NEXT: movq -104(%rsp,%rdi), %r9 +; FALLBACK9-NEXT: movq %r9, %rax +; FALLBACK9-NEXT: shrdq %cl, %rsi, %rax +; FALLBACK9-NEXT: movq -112(%rsp,%rdi), %r10 ; FALLBACK9-NEXT: movq %r10, %r8 ; FALLBACK9-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK9-NEXT: movq -80(%rsp,%rdi), %r9 +; FALLBACK9-NEXT: movq -88(%rsp,%rdi), %r11 ; FALLBACK9-NEXT: movq %r11, %rbx ; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK9-NEXT: shrdq %cl, %r11, %rsi +; FALLBACK9-NEXT: movq -72(%rsp,%rdi), %r11 ; FALLBACK9-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK9-NEXT: movq %rax, %r15 +; FALLBACK9-NEXT: movq -128(%rsp,%rdi), %r14 +; FALLBACK9-NEXT: movq -120(%rsp,%rdi), %rdi +; FALLBACK9-NEXT: movq %rdi, %r15 ; FALLBACK9-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK9-NEXT: shrdq %cl, %rdi, %r14 ; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK9-NEXT: shrq %cl, %r11 ; FALLBACK9-NEXT: movq %r15, 8(%rdx) ; FALLBACK9-NEXT: movq %r9, 48(%rdx) ; FALLBACK9-NEXT: movq %r11, 56(%rdx) -; FALLBACK9-NEXT: movq %rdi, 32(%rdx) +; FALLBACK9-NEXT: movq %rsi, 32(%rdx) ; FALLBACK9-NEXT: movq %rbx, 40(%rdx) ; FALLBACK9-NEXT: movq %r8, 16(%rdx) -; FALLBACK9-NEXT: movq %rsi, 24(%rdx) +; FALLBACK9-NEXT: movq %rax, 24(%rdx) ; FALLBACK9-NEXT: movq %r14, (%rdx) ; FALLBACK9-NEXT: popq %rbx ; FALLBACK9-NEXT: popq %r14 @@ -12795,77 +12775,73 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK10-LABEL: lshr_64bytes: ; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: pushq %rbp ; FALLBACK10-NEXT: pushq %r15 ; FALLBACK10-NEXT: pushq %r14 ; FALLBACK10-NEXT: pushq %r13 ; FALLBACK10-NEXT: pushq %r12 ; FALLBACK10-NEXT: pushq %rbx -; FALLBACK10-NEXT: pushq %rax ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK10-NEXT: movl (%rsi), %eax +; FALLBACK10-NEXT: movl (%rsi), %esi ; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: leal (,%rax,8), %esi -; FALLBACK10-NEXT: andl $56, %esi +; FALLBACK10-NEXT: leal (,%rsi,8), %eax ; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 -; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx -; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi -; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12 -; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9 -; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10 -; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14 -; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK10-NEXT: movl %esi, %ebx -; FALLBACK10-NEXT: notb %bl -; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp -; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK10-NEXT: orq %r11, %r8 -; FALLBACK10-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK10-NEXT: 
orq %r12, %r11 -; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12 -; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13 -; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp -; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax -; FALLBACK10-NEXT: shrxq %rsi, %rax, %rsi -; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK10-NEXT: orq %r9, %rdi -; FALLBACK10-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK10-NEXT: orq %r14, %r9 -; FALLBACK10-NEXT: addq %r10, %r10 -; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK10-NEXT: orq %r15, %r10 -; FALLBACK10-NEXT: addq %rax, %rax -; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax -; FALLBACK10-NEXT: orq %r13, %rax -; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx -; FALLBACK10-NEXT: orq %rbp, %rcx -; FALLBACK10-NEXT: movq %rsi, 56(%rdx) -; FALLBACK10-NEXT: movq %rcx, 8(%rdx) -; FALLBACK10-NEXT: movq %rax, 48(%rdx) -; FALLBACK10-NEXT: movq %r10, 32(%rdx) -; FALLBACK10-NEXT: movq %r9, 40(%rdx) -; FALLBACK10-NEXT: movq %rdi, 16(%rdx) -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %r8, (%rdx) -; FALLBACK10-NEXT: addq $8, %rsp +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: andl $56, %esi +; FALLBACK10-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -120(%rsp,%rsi), %r10 +; FALLBACK10-NEXT: movq -112(%rsp,%rsi), %r9 +; FALLBACK10-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK10-NEXT: orq %r8, %rdi +; FALLBACK10-NEXT: movq -104(%rsp,%rsi), %r11 +; FALLBACK10-NEXT: shrxq %rcx, %r11, %rbx +; FALLBACK10-NEXT: movq -96(%rsp,%rsi), %r14 +; FALLBACK10-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK10-NEXT: shlxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %rbx, %r8 +; FALLBACK10-NEXT: shrxq %rcx, %r9, %rbx +; FALLBACK10-NEXT: addq %r11, %r11 +; FALLBACK10-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %rbx, %r11 +; FALLBACK10-NEXT: movq -88(%rsp,%rsi), %rbx +; FALLBACK10-NEXT: shrxq %rcx, %rbx, %r15 +; FALLBACK10-NEXT: movq -80(%rsp,%rsi), %r12 +; FALLBACK10-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK10-NEXT: shlxq %rax, %r13, %r13 +; FALLBACK10-NEXT: orq %r15, %r13 +; FALLBACK10-NEXT: shrxq %rcx, %r14, %r14 +; FALLBACK10-NEXT: addq %rbx, %rbx +; FALLBACK10-NEXT: shlxq %rax, %rbx, %rbx +; FALLBACK10-NEXT: orq %r14, %rbx +; FALLBACK10-NEXT: shrxq %rcx, %r12, %r14 +; FALLBACK10-NEXT: movq -72(%rsp,%rsi), %rsi +; FALLBACK10-NEXT: leaq (%rsi,%rsi), %r15 +; FALLBACK10-NEXT: shlxq %rax, %r15, %r15 +; FALLBACK10-NEXT: orq %r14, %r15 +; FALLBACK10-NEXT: shrxq %rcx, %r10, %r10 +; FALLBACK10-NEXT: addq %r9, %r9 +; FALLBACK10-NEXT: shlxq %rax, %r9, %rax +; FALLBACK10-NEXT: orq %r10, %rax +; FALLBACK10-NEXT: shrxq %rcx, %rsi, %rcx +; FALLBACK10-NEXT: movq %rcx, 56(%rdx) +; FALLBACK10-NEXT: movq %rax, 8(%rdx) +; FALLBACK10-NEXT: movq %r15, 48(%rdx) +; FALLBACK10-NEXT: movq %rbx, 32(%rdx) +; FALLBACK10-NEXT: movq %r13, 40(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r8, 24(%rdx) +; FALLBACK10-NEXT: movq %rdi, (%rdx) ; FALLBACK10-NEXT: popq %rbx ; FALLBACK10-NEXT: popq %r12 ; FALLBACK10-NEXT: popq %r13 ; FALLBACK10-NEXT: popq %r14 ; FALLBACK10-NEXT: popq %r15 -; FALLBACK10-NEXT: popq %rbp ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -12930,45 +12906,45 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: pushq %rbx ; FALLBACK12-NEXT: pushq %rax ; FALLBACK12-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK12-NEXT: movl (%rsi), %r9d +; 
FALLBACK12-NEXT: movl (%rsi), %r10d ; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK12-NEXT: leal (,%r9,8), %eax +; FALLBACK12-NEXT: leal (,%r10,8), %eax ; FALLBACK12-NEXT: andl $56, %eax -; FALLBACK12-NEXT: andl $56, %r9d -; FALLBACK12-NEXT: movq -128(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq -120(%rsp,%r9), %r8 +; FALLBACK12-NEXT: andl $56, %r10d +; FALLBACK12-NEXT: movq -128(%rsp,%r10), %r9 +; FALLBACK12-NEXT: movq -120(%rsp,%r10), %r8 ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: shrq %cl, %r9 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rdi -; FALLBACK12-NEXT: orq %r10, %rdi -; FALLBACK12-NEXT: movq -104(%rsp,%r9), %r10 -; FALLBACK12-NEXT: movq %r10, %rbx +; FALLBACK12-NEXT: orq %r9, %rdi +; FALLBACK12-NEXT: movq -104(%rsp,%r10), %r9 +; FALLBACK12-NEXT: movq %r9, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %rbx -; FALLBACK12-NEXT: movq -96(%rsp,%r9), %r12 +; FALLBACK12-NEXT: movq -96(%rsp,%r10), %r12 ; FALLBACK12-NEXT: leaq (%r12,%r12), %r11 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r11 ; FALLBACK12-NEXT: orq %rbx, %r11 -; FALLBACK12-NEXT: movq -112(%rsp,%r9), %rbx +; FALLBACK12-NEXT: movq -112(%rsp,%r10), %rbx ; FALLBACK12-NEXT: movq %rbx, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r14 -; FALLBACK12-NEXT: addq %r10, %r10 +; FALLBACK12-NEXT: addq %r9, %r9 ; FALLBACK12-NEXT: movl %esi, %ecx -; FALLBACK12-NEXT: shlq %cl, %r10 -; FALLBACK12-NEXT: orq %r14, %r10 -; FALLBACK12-NEXT: movq -88(%rsp,%r9), %r14 +; FALLBACK12-NEXT: shlq %cl, %r9 +; FALLBACK12-NEXT: orq %r14, %r9 +; FALLBACK12-NEXT: movq -88(%rsp,%r10), %r14 ; FALLBACK12-NEXT: movq %r14, %r13 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r13 -; FALLBACK12-NEXT: movq -80(%rsp,%r9), %rbp +; FALLBACK12-NEXT: movq -80(%rsp,%r10), %rbp ; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r15 @@ -12981,8 +12957,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: orq %r12, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %rbp -; FALLBACK12-NEXT: movq -72(%rsp,%r9), %r9 -; FALLBACK12-NEXT: leaq (%r9,%r9), %r12 +; FALLBACK12-NEXT: movq -72(%rsp,%r10), %r10 +; FALLBACK12-NEXT: leaq (%r10,%r10), %r12 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r12 ; FALLBACK12-NEXT: orq %rbp, %r12 @@ -12993,13 +12969,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: shlq %cl, %rbx ; FALLBACK12-NEXT: orq %r8, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx -; FALLBACK12-NEXT: shrq %cl, %r9 -; FALLBACK12-NEXT: movq %r9, 56(%rdx) +; FALLBACK12-NEXT: shrq %cl, %r10 +; FALLBACK12-NEXT: movq %r10, 56(%rdx) ; FALLBACK12-NEXT: movq %rbx, 8(%rdx) ; FALLBACK12-NEXT: movq %r12, 48(%rdx) ; FALLBACK12-NEXT: movq %r14, 32(%rdx) ; FALLBACK12-NEXT: movq %r15, 40(%rdx) -; FALLBACK12-NEXT: movq %r10, 16(%rdx) +; FALLBACK12-NEXT: movq %r9, 16(%rdx) ; FALLBACK12-NEXT: movq %r11, 24(%rdx) ; FALLBACK12-NEXT: movq %rdi, (%rdx) ; FALLBACK12-NEXT: addq $8, %rsp @@ -13062,74 +13038,70 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK14-LABEL: lshr_64bytes: ; FALLBACK14: # %bb.0: -; 
FALLBACK14-NEXT: pushq %rbp ; FALLBACK14-NEXT: pushq %r15 ; FALLBACK14-NEXT: pushq %r14 ; FALLBACK14-NEXT: pushq %r13 ; FALLBACK14-NEXT: pushq %r12 ; FALLBACK14-NEXT: pushq %rbx -; FALLBACK14-NEXT: pushq %rax ; FALLBACK14-NEXT: vmovups (%rdi), %zmm0 ; FALLBACK14-NEXT: movl (%rsi), %esi ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: leal (,%rsi,8), %ecx -; FALLBACK14-NEXT: andl $56, %ecx +; FALLBACK14-NEXT: leal (,%rsi,8), %eax +; FALLBACK14-NEXT: andl $56, %eax +; FALLBACK14-NEXT: movl %eax, %ecx ; FALLBACK14-NEXT: andl $56, %esi -; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r11 -; FALLBACK14-NEXT: movq -112(%rsp,%rsi), %rax -; FALLBACK14-NEXT: movq -104(%rsp,%rsi), %rdi -; FALLBACK14-NEXT: shrxq %rcx, %rdi, %r12 -; FALLBACK14-NEXT: movq -96(%rsp,%rsi), %r13 -; FALLBACK14-NEXT: shrxq %rcx, %rax, %r9 -; FALLBACK14-NEXT: movq -88(%rsp,%rsi), %r10 -; FALLBACK14-NEXT: shrxq %rcx, %r10, %r14 -; FALLBACK14-NEXT: shrxq %rcx, %r13, %r15 -; FALLBACK14-NEXT: movl %ecx, %ebx -; FALLBACK14-NEXT: notb %bl -; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %rbp -; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK14-NEXT: orq %r11, %r8 -; FALLBACK14-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK14-NEXT: orq %r12, %r11 +; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %r10 +; FALLBACK14-NEXT: movq -112(%rsp,%rsi), %r9 +; FALLBACK14-NEXT: leaq (%r10,%r10), %rdi +; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi +; FALLBACK14-NEXT: orq %r8, %rdi +; FALLBACK14-NEXT: movq -104(%rsp,%rsi), %r11 +; FALLBACK14-NEXT: shrxq %rcx, %r11, %rbx +; FALLBACK14-NEXT: movq -96(%rsp,%rsi), %r14 +; FALLBACK14-NEXT: leaq (%r14,%r14), %r8 +; FALLBACK14-NEXT: shlxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %rbx, %r8 +; FALLBACK14-NEXT: shrxq %rcx, %r9, %rbx +; FALLBACK14-NEXT: addq %r11, %r11 +; FALLBACK14-NEXT: shlxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %rbx, %r11 +; FALLBACK14-NEXT: movq -88(%rsp,%rsi), %rbx +; FALLBACK14-NEXT: shrxq %rcx, %rbx, %r15 ; FALLBACK14-NEXT: movq -80(%rsp,%rsi), %r12 -; FALLBACK14-NEXT: shrxq %rcx, %r12, %r13 -; FALLBACK14-NEXT: shrxq %rcx, %rbp, %rbp +; FALLBACK14-NEXT: leaq (%r12,%r12), %r13 +; FALLBACK14-NEXT: shlxq %rax, %r13, %r13 +; FALLBACK14-NEXT: orq %r15, %r13 +; FALLBACK14-NEXT: shrxq %rcx, %r14, %r14 +; FALLBACK14-NEXT: addq %rbx, %rbx +; FALLBACK14-NEXT: shlxq %rax, %rbx, %rbx +; FALLBACK14-NEXT: orq %r14, %rbx +; FALLBACK14-NEXT: shrxq %rcx, %r12, %r14 ; FALLBACK14-NEXT: movq -72(%rsp,%rsi), %rsi +; FALLBACK14-NEXT: leaq (%rsi,%rsi), %r15 +; FALLBACK14-NEXT: shlxq %rax, %r15, %r15 +; FALLBACK14-NEXT: orq %r14, %r15 +; FALLBACK14-NEXT: shrxq %rcx, %r10, %r10 +; FALLBACK14-NEXT: addq %r9, %r9 +; FALLBACK14-NEXT: shlxq %rax, %r9, %rax +; FALLBACK14-NEXT: orq %r10, %rax ; FALLBACK14-NEXT: shrxq %rcx, %rsi, %rcx -; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi -; FALLBACK14-NEXT: orq %r9, %rdi -; FALLBACK14-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9 -; FALLBACK14-NEXT: orq %r14, %r9 -; FALLBACK14-NEXT: addq %r10, %r10 -; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10 -; FALLBACK14-NEXT: orq %r15, %r10 -; FALLBACK14-NEXT: addq %rsi, %rsi -; FALLBACK14-NEXT: shlxq %rbx, %rsi, %rsi -; FALLBACK14-NEXT: orq %r13, %rsi -; FALLBACK14-NEXT: addq %rax, %rax -; FALLBACK14-NEXT: shlxq %rbx, 
%rax, %rax -; FALLBACK14-NEXT: orq %rbp, %rax ; FALLBACK14-NEXT: movq %rcx, 56(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) -; FALLBACK14-NEXT: movq %rsi, 48(%rdx) -; FALLBACK14-NEXT: movq %r10, 32(%rdx) -; FALLBACK14-NEXT: movq %r9, 40(%rdx) -; FALLBACK14-NEXT: movq %rdi, 16(%rdx) -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %r8, (%rdx) -; FALLBACK14-NEXT: addq $8, %rsp +; FALLBACK14-NEXT: movq %r15, 48(%rdx) +; FALLBACK14-NEXT: movq %rbx, 32(%rdx) +; FALLBACK14-NEXT: movq %r13, 40(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r8, 24(%rdx) +; FALLBACK14-NEXT: movq %rdi, (%rdx) ; FALLBACK14-NEXT: popq %rbx ; FALLBACK14-NEXT: popq %r12 ; FALLBACK14-NEXT: popq %r13 ; FALLBACK14-NEXT: popq %r14 ; FALLBACK14-NEXT: popq %r15 -; FALLBACK14-NEXT: popq %rbp ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -13139,40 +13111,40 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK15-NEXT: pushq %r14 ; FALLBACK15-NEXT: pushq %rbx ; FALLBACK15-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK15-NEXT: movl (%rsi), %eax +; FALLBACK15-NEXT: movl (%rsi), %edi ; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: leal (,%rax,8), %ecx +; FALLBACK15-NEXT: leal (,%rdi,8), %ecx ; FALLBACK15-NEXT: andl $56, %ecx -; FALLBACK15-NEXT: andl $56, %eax -; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq %r9, %rsi -; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK15-NEXT: andl $56, %edi +; FALLBACK15-NEXT: movq -96(%rsp,%rdi), %rsi +; FALLBACK15-NEXT: movq -104(%rsp,%rdi), %r9 +; FALLBACK15-NEXT: movq %r9, %rax +; FALLBACK15-NEXT: shrdq %cl, %rsi, %rax +; FALLBACK15-NEXT: movq -112(%rsp,%rdi), %r10 ; FALLBACK15-NEXT: movq %r10, %r8 ; FALLBACK15-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK15-NEXT: movq -80(%rsp,%rdi), %r9 +; FALLBACK15-NEXT: movq -88(%rsp,%rdi), %r11 ; FALLBACK15-NEXT: movq %r11, %rbx ; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK15-NEXT: shrdq %cl, %r11, %rsi +; FALLBACK15-NEXT: movq -72(%rsp,%rdi), %r11 ; FALLBACK15-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK15-NEXT: movq %rax, %r15 +; FALLBACK15-NEXT: movq -128(%rsp,%rdi), %r14 +; FALLBACK15-NEXT: movq -120(%rsp,%rdi), %rdi +; FALLBACK15-NEXT: movq %rdi, %r15 ; FALLBACK15-NEXT: shrdq %cl, %r10, %r15 ; FALLBACK15-NEXT: shrxq %rcx, %r11, %r10 ; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK15-NEXT: shrdq %cl, %rdi, %r14 ; FALLBACK15-NEXT: movq %r15, 8(%rdx) ; FALLBACK15-NEXT: movq %r9, 48(%rdx) -; FALLBACK15-NEXT: movq %rdi, 32(%rdx) +; FALLBACK15-NEXT: movq %rsi, 32(%rdx) ; FALLBACK15-NEXT: movq %rbx, 40(%rdx) ; FALLBACK15-NEXT: movq %r8, 16(%rdx) -; FALLBACK15-NEXT: movq %rsi, 24(%rdx) +; FALLBACK15-NEXT: movq %rax, 24(%rdx) ; FALLBACK15-NEXT: movq %r14, (%rdx) ; FALLBACK15-NEXT: movq %r10, 56(%rdx) ; FALLBACK15-NEXT: popq %rbx @@ -13618,14 +13590,15 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: 
movl 36(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 40(%eax), %ebp -; FALLBACK18-NEXT: movl 44(%eax), %ebx +; FALLBACK18-NEXT: movl 40(%eax), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 44(%eax), %ebp ; FALLBACK18-NEXT: movl 48(%eax), %edi ; FALLBACK18-NEXT: movl 52(%eax), %esi ; FALLBACK18-NEXT: movl 56(%eax), %edx ; FALLBACK18-NEXT: movl 60(%eax), %ecx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl (%eax), %eax +; FALLBACK18-NEXT: movl (%eax), %ebx ; FALLBACK18-NEXT: xorps %xmm0, %xmm0 ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) @@ -13634,136 +13607,138 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: movl %eax, %ecx -; FALLBACK18-NEXT: leal (,%eax,8), %edx +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte 
Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK18-NEXT: leal (,%ebx,8), %edx ; FALLBACK18-NEXT: andl $24, %edx -; FALLBACK18-NEXT: andl $60, %ecx -; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi -; FALLBACK18-NEXT: movl 72(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl %edx, %ecx +; FALLBACK18-NEXT: andl $60, %ebx +; FALLBACK18-NEXT: movl 68(%esp,%ebx), %esi +; FALLBACK18-NEXT: movl 72(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %edi -; FALLBACK18-NEXT: movl %edx, %ebx -; FALLBACK18-NEXT: notb %bl +; FALLBACK18-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK18-NEXT: notb %dl ; FALLBACK18-NEXT: leal (%eax,%eax), %ebp -; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK18-NEXT: shrxl %ecx, 64(%esp,%ebx), %edi ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax ; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi +; FALLBACK18-NEXT: movl 80(%esp,%ebx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 76(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl 88(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 84(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi +; FALLBACK18-NEXT: movl 96(%esp,%ebx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 92(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; 
FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl 104(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax +; FALLBACK18-NEXT: movl 100(%esp,%ebx), %edi +; FALLBACK18-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %esi +; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi -; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK18-NEXT: shlxl %edx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax +; FALLBACK18-NEXT: movl 112(%esp,%ebx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK18-NEXT: movl %ecx, %edi -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: shlxl %edx, %esi, %eax +; FALLBACK18-NEXT: movl 108(%esp,%ebx), %esi +; FALLBACK18-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %ecx, %ebp ; FALLBACK18-NEXT: addl %esi, %esi -; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %ecx, %esi -; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK18-NEXT: shrxl %edx, %eax, %edi -; FALLBACK18-NEXT: orl %edi, %ecx -; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edx, %esi, %ecx +; FALLBACK18-NEXT: orl %eax, %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 120(%esp,%ebx), %edi +; FALLBACK18-NEXT: leal (%edi,%edi), %ecx +; FALLBACK18-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK18-NEXT: movl 116(%esp,%ebx), %eax +; FALLBACK18-NEXT: movl %ebp, %ecx +; FALLBACK18-NEXT: shrxl %ebp, %eax, %ebp +; FALLBACK18-NEXT: orl %ebp, %esi +; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl %ecx, %ebp ; FALLBACK18-NEXT: addl %eax, %eax -; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK18-NEXT: orl 
{{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK18-NEXT: shrxl %edx, %ebp, %edx -; FALLBACK18-NEXT: addl %ebp, %ebp -; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK18-NEXT: orl %eax, %ebx +; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK18-NEXT: movl 124(%esp,%ebx), %eax +; FALLBACK18-NEXT: leal (%eax,%eax), %ebx +; FALLBACK18-NEXT: shlxl %edx, %ebx, %edx +; FALLBACK18-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK18-NEXT: orl %edi, %edx +; FALLBACK18-NEXT: shrxl %ebp, %eax, %edi ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl %edx, 60(%eax) -; FALLBACK18-NEXT: movl %ebx, 56(%eax) -; FALLBACK18-NEXT: movl %edi, 48(%eax) -; FALLBACK18-NEXT: movl %ecx, 52(%eax) -; FALLBACK18-NEXT: movl %esi, 40(%eax) +; FALLBACK18-NEXT: movl %edi, 60(%eax) +; FALLBACK18-NEXT: movl %edx, 56(%eax) +; FALLBACK18-NEXT: movl %ecx, 48(%eax) +; FALLBACK18-NEXT: movl %esi, 52(%eax) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK18-NEXT: movl %ecx, 40(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 44(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -14284,7 +14259,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK22-NEXT: movups 32(%ecx), %xmm2 ; FALLBACK22-NEXT: movups 48(%ecx), %xmm3 -; FALLBACK22-NEXT: movl (%eax), %ecx +; FALLBACK22-NEXT: movl (%eax), %ebx ; FALLBACK22-NEXT: xorps %xmm4, %xmm4 ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) @@ -14294,112 +14269,114 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: leal (,%ecx,8), %edx +; FALLBACK22-NEXT: leal (,%ebx,8), %edx ; FALLBACK22-NEXT: andl $24, %edx -; FALLBACK22-NEXT: andl $60, %ecx -; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl 72(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl %edx, %ecx +; FALLBACK22-NEXT: andl $60, %ebx +; FALLBACK22-NEXT: movl 68(%esp,%ebx), %esi +; FALLBACK22-NEXT: movl 72(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %edi -; FALLBACK22-NEXT: movl %edx, %ebx -; FALLBACK22-NEXT: notb %bl +; FALLBACK22-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK22-NEXT: notb %dl ; FALLBACK22-NEXT: leal (%eax,%eax), %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp -; FALLBACK22-NEXT: orl %edi, %ebp -; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK22-NEXT: shlxl %edx, %ebp, %eax +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %ecx, 64(%esp,%ebx), %edi ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %edi, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax +; FALLBACK22-NEXT: orl %edi, %eax +; 
FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 80(%esp,%ebx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 76(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: shlxl %edx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl 88(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 84(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi +; FALLBACK22-NEXT: movl 96(%esp,%ebx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 92(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK22-NEXT: shlxl %edx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl 104(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax +; FALLBACK22-NEXT: movl 100(%esp,%ebx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK22-NEXT: shlxl %edx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: movl 112(%esp,%ecx), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal (%ecx,%ecx), %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %ecx -; FALLBACK22-NEXT: movl 108(%esp,%eax), %esi +; FALLBACK22-NEXT: movl 112(%esp,%ebx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK22-NEXT: orl %ebp, %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: leal (%eax,%eax), %esi +; FALLBACK22-NEXT: shlxl %edx, %esi, %eax +; FALLBACK22-NEXT: movl 108(%esp,%ebx), %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %ecx, %ebp ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl 120(%esp,%eax), %ebp -; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: movl 116(%esp,%eax), %eax -; FALLBACK22-NEXT: shrxl %edx, %eax, %edi -; FALLBACK22-NEXT: orl %edi, %ecx -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shlxl %edx, %esi, %ecx +; FALLBACK22-NEXT: orl %eax, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 120(%esp,%ebx), %edi +; FALLBACK22-NEXT: leal (%edi,%edi), %ecx +; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK22-NEXT: movl 116(%esp,%ebx), %eax +; FALLBACK22-NEXT: movl %ebp, %ecx +; FALLBACK22-NEXT: shrxl %ebp, %eax, %ebp +; FALLBACK22-NEXT: orl %ebp, %esi +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl %ecx, %ebp ; FALLBACK22-NEXT: addl %eax, %eax -; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK22-NEXT: shrxl %edx, %ebp, %edx -; FALLBACK22-NEXT: addl %ebp, %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK22-NEXT: orl %eax, %ebx +; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: movl 124(%esp,%ebx), %eax +; FALLBACK22-NEXT: leal (%eax,%eax), %ebx +; FALLBACK22-NEXT: shlxl %edx, %ebx, %edx +; FALLBACK22-NEXT: shrxl %ebp, %edi, %edi +; FALLBACK22-NEXT: orl %edi, %edx +; FALLBACK22-NEXT: shrxl %ebp, %eax, %edi ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl %edx, 60(%eax) -; FALLBACK22-NEXT: movl %ebx, 56(%eax) -; FALLBACK22-NEXT: movl %edi, 48(%eax) -; FALLBACK22-NEXT: movl %ecx, 52(%eax) -; FALLBACK22-NEXT: movl %esi, 40(%eax) +; FALLBACK22-NEXT: movl %edi, 60(%eax) +; FALLBACK22-NEXT: movl %edx, 56(%eax) +; FALLBACK22-NEXT: movl %ecx, 48(%eax) +; FALLBACK22-NEXT: movl %esi, 52(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 
4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 40(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 44(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -14873,109 +14850,107 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: leal (,%ecx,8), %edx ; FALLBACK26-NEXT: andl $24, %edx +; FALLBACK26-NEXT: movl %edx, %ebx ; FALLBACK26-NEXT: andl $60, %ecx ; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK26-NEXT: movl 72(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %edi -; FALLBACK26-NEXT: movl %edx, %ebx -; FALLBACK26-NEXT: notb %bl +; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK26-NEXT: notb %dl ; FALLBACK26-NEXT: leal (%eax,%eax), %ebp -; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK26-NEXT: shlxl %edx, %ebp, %ebp ; FALLBACK26-NEXT: orl %edi, %ebp ; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK26-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK26-NEXT: shlxl %edx, %esi, %esi ; FALLBACK26-NEXT: orl %edi, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: shlxl %edx, %esi, %eax ; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %eax, %esi -; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: movl 120(%esp,%ecx), %ebp -; FALLBACK26-NEXT: leal (%ebp,%ebp), %eax -; FALLBACK26-NEXT: shlxl %ebx, %eax, %esi +; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK26-NEXT: orl %eax, %ebp +; FALLBACK26-NEXT: movl 120(%esp,%ecx), %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: addl %eax, %eax +; FALLBACK26-NEXT: shlxl %edx, %eax, %esi ; FALLBACK26-NEXT: movl 116(%esp,%ecx), %eax -; FALLBACK26-NEXT: shrxl %edx, %eax, %edi +; FALLBACK26-NEXT: shrxl %ebx, %eax, %edi ; FALLBACK26-NEXT: orl %edi, %esi -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %eax, %eax -; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax +; FALLBACK26-NEXT: shlxl %edx, %eax, %eax +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl 124(%esp,%ecx), %ecx -; FALLBACK26-NEXT: shrxl %edx, %ecx, %edx -; FALLBACK26-NEXT: addl %ecx, %ecx -; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ebx -; FALLBACK26-NEXT: orl %eax, %ebx +; FALLBACK26-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK26-NEXT: shlxl %edx, %edi, %edx +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK26-NEXT: orl %edi, %edx +; FALLBACK26-NEXT: shrxl %ebx, %ecx, %edi ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK26-NEXT: movl 
%edx, 60(%ecx) -; FALLBACK26-NEXT: movl %ebx, 56(%ecx) -; FALLBACK26-NEXT: movl %edi, 48(%ecx) +; FALLBACK26-NEXT: movl %edi, 60(%ecx) +; FALLBACK26-NEXT: movl %edx, 56(%ecx) +; FALLBACK26-NEXT: movl %eax, 48(%ecx) ; FALLBACK26-NEXT: movl %esi, 52(%ecx) -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK26-NEXT: movl %eax, 40(%ecx) +; FALLBACK26-NEXT: movl %ebp, 40(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 44(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -15430,115 +15405,113 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK30-NEXT: movl (%eax), %edx +; FALLBACK30-NEXT: movl (%eax), %ecx ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK30-NEXT: leal (,%edx,8), %ecx -; FALLBACK30-NEXT: andl $24, %ecx -; FALLBACK30-NEXT: andl $60, %edx -; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi -; FALLBACK30-NEXT: movl 72(%esp,%edx), %eax +; FALLBACK30-NEXT: leal (,%ecx,8), %edx +; FALLBACK30-NEXT: andl $24, %edx +; FALLBACK30-NEXT: movl %edx, %ebx +; FALLBACK30-NEXT: andl $60, %ecx +; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi +; FALLBACK30-NEXT: movl 72(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, %esi, %edi -; FALLBACK30-NEXT: movl %ecx, %ebx -; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK30-NEXT: notb %dl ; FALLBACK30-NEXT: leal (%eax,%eax), %ebp -; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp +; FALLBACK30-NEXT: shlxl %edx, %ebp, %ebp ; FALLBACK30-NEXT: orl %edi, %ebp ; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, 64(%esp,%edx), %edi +; FALLBACK30-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi +; FALLBACK30-NEXT: shlxl %edx, %esi, %esi ; FALLBACK30-NEXT: orl %edi, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi +; FALLBACK30-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 76(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax +; FALLBACK30-NEXT: movl 76(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 88(%esp,%edx), %eax +; FALLBACK30-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 84(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: 
shlxl %edx, %edi, %eax +; FALLBACK30-NEXT: movl 84(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 96(%esp,%edx), %esi +; FALLBACK30-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 92(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax +; FALLBACK30-NEXT: movl 92(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 104(%esp,%edx), %eax +; FALLBACK30-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK30-NEXT: movl 100(%esp,%edx), %edi -; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax +; FALLBACK30-NEXT: movl 100(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 112(%esp,%edx), %eax +; FALLBACK30-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax -; FALLBACK30-NEXT: movl 108(%esp,%edx), %esi -; FALLBACK30-NEXT: shrxl %ecx, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax +; FALLBACK30-NEXT: movl 108(%esp,%ecx), %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK30-NEXT: orl %edi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %eax, %esi -; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl 120(%esp,%edx), %ebp -; FALLBACK30-NEXT: leal (%ebp,%ebp), %eax -; FALLBACK30-NEXT: shlxl %ebx, %eax, %esi -; FALLBACK30-NEXT: movl 116(%esp,%edx), %eax -; FALLBACK30-NEXT: shrxl %ecx, %eax, %edi +; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK30-NEXT: 
orl %eax, %ebp +; FALLBACK30-NEXT: movl 120(%esp,%ecx), %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: addl %eax, %eax +; FALLBACK30-NEXT: shlxl %edx, %eax, %esi +; FALLBACK30-NEXT: movl 116(%esp,%ecx), %eax +; FALLBACK30-NEXT: shrxl %ebx, %eax, %edi ; FALLBACK30-NEXT: orl %edi, %esi -; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %eax, %eax -; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: shrxl %ecx, %ebp, %eax -; FALLBACK30-NEXT: movl 124(%esp,%edx), %edx -; FALLBACK30-NEXT: shrxl %ecx, %edx, %ebp -; FALLBACK30-NEXT: leal (%edx,%edx), %ecx -; FALLBACK30-NEXT: shlxl %ebx, %ecx, %edx -; FALLBACK30-NEXT: orl %eax, %edx +; FALLBACK30-NEXT: shlxl %edx, %eax, %eax +; FALLBACK30-NEXT: orl %edi, %eax +; FALLBACK30-NEXT: movl 124(%esp,%ecx), %ecx +; FALLBACK30-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edx +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK30-NEXT: orl %edi, %edx +; FALLBACK30-NEXT: shrxl %ebx, %ecx, %edi ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK30-NEXT: movl %ebp, 60(%ecx) +; FALLBACK30-NEXT: movl %edi, 60(%ecx) ; FALLBACK30-NEXT: movl %edx, 56(%ecx) -; FALLBACK30-NEXT: movl %edi, 48(%ecx) +; FALLBACK30-NEXT: movl %eax, 48(%ecx) ; FALLBACK30-NEXT: movl %esi, 52(%ecx) -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK30-NEXT: movl %eax, 40(%ecx) +; FALLBACK30-NEXT: movl %ebp, 40(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 44(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -16196,10 +16169,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK2-LABEL: shl_64bytes: ; FALLBACK2: # %bb.0: -; FALLBACK2-NEXT: pushq %rbp ; FALLBACK2-NEXT: pushq %r15 ; FALLBACK2-NEXT: pushq %r14 -; FALLBACK2-NEXT: pushq %r13 ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx ; FALLBACK2-NEXT: pushq %rax @@ -16227,62 +16198,60 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rsi,8), %eax ; FALLBACK2-NEXT: andl $56, %eax +; FALLBACK2-NEXT: movl %eax, %ecx ; FALLBACK2-NEXT: andl $56, %esi ; FALLBACK2-NEXT: negl %esi ; FALLBACK2-NEXT: movslq %esi, %rsi -; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %r10 -; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r9 -; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi -; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 -; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %r14 -; FALLBACK2-NEXT: shlxq %rax, %r14, %rbx -; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %r8 -; FALLBACK2-NEXT: shlxq %rax, %r8, %r15 -; FALLBACK2-NEXT: shlxq %rax, %r10, %r12 -; FALLBACK2-NEXT: movl %eax, %r13d -; FALLBACK2-NEXT: notb %r13b -; FALLBACK2-NEXT: shrq %r10 -; FALLBACK2-NEXT: shrxq %r13, %r10, %r10 -; FALLBACK2-NEXT: orq %r9, %r10 -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: shlxq %rax, %r9, %rbp -; FALLBACK2-NEXT: shrq %r14 -; FALLBACK2-NEXT: shrxq %r13, %r14, %r14 -; FALLBACK2-NEXT: orq %r11, %r14 -; FALLBACK2-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11 -; 
FALLBACK2-NEXT: movq -16(%rsp,%rsi), %rsi -; FALLBACK2-NEXT: shlxq %rax, %rsi, %rax -; FALLBACK2-NEXT: shrq %rcx -; FALLBACK2-NEXT: shrxq %r13, %rcx, %rcx -; FALLBACK2-NEXT: orq %rbx, %rcx +; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %r9 +; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %rdi +; FALLBACK2-NEXT: shlxq %rcx, %rdi, %r8 +; FALLBACK2-NEXT: notb %al +; FALLBACK2-NEXT: shlxq %rcx, %r9, %r10 ; FALLBACK2-NEXT: shrq %r9 -; FALLBACK2-NEXT: shrxq %r13, %r9, %r9 -; FALLBACK2-NEXT: orq %r15, %r9 +; FALLBACK2-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK2-NEXT: orq %r8, %r9 +; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %r11 +; FALLBACK2-NEXT: shlxq %rcx, %r11, %rbx +; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %r8 +; FALLBACK2-NEXT: shlxq %rcx, %r8, %r14 +; FALLBACK2-NEXT: shrq %r8 +; FALLBACK2-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK2-NEXT: orq %rbx, %r8 ; FALLBACK2-NEXT: shrq %rdi -; FALLBACK2-NEXT: shrxq %r13, %rdi, %rdi -; FALLBACK2-NEXT: orq %rbp, %rdi +; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi +; FALLBACK2-NEXT: orq %r14, %rdi +; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rbx +; FALLBACK2-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r15 +; FALLBACK2-NEXT: shlxq %rcx, %r15, %r12 +; FALLBACK2-NEXT: shrq %r15 +; FALLBACK2-NEXT: shrxq %rax, %r15, %r15 +; FALLBACK2-NEXT: orq %r14, %r15 +; FALLBACK2-NEXT: shrq %r11 +; FALLBACK2-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK2-NEXT: orq %r12, %r11 +; FALLBACK2-NEXT: shlxq %rcx, -8(%rsp,%rsi), %r14 +; FALLBACK2-NEXT: movq -16(%rsp,%rsi), %rsi +; FALLBACK2-NEXT: shlxq %rcx, %rsi, %rcx ; FALLBACK2-NEXT: shrq %rsi -; FALLBACK2-NEXT: shrxq %r13, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: shrq %r8 -; FALLBACK2-NEXT: shrxq %r13, %r8, %r8 -; FALLBACK2-NEXT: orq %rax, %r8 -; FALLBACK2-NEXT: movq %r12, (%rdx) -; FALLBACK2-NEXT: movq %r8, 48(%rdx) +; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi +; FALLBACK2-NEXT: orq %r14, %rsi +; FALLBACK2-NEXT: shrq %rbx +; FALLBACK2-NEXT: shrxq %rax, %rbx, %rax +; FALLBACK2-NEXT: orq %rcx, %rax +; FALLBACK2-NEXT: movq %r10, (%rdx) +; FALLBACK2-NEXT: movq %rax, 48(%rdx) ; FALLBACK2-NEXT: movq %rsi, 56(%rdx) -; FALLBACK2-NEXT: movq %rdi, 32(%rdx) -; FALLBACK2-NEXT: movq %r9, 40(%rdx) -; FALLBACK2-NEXT: movq %rcx, 16(%rdx) -; FALLBACK2-NEXT: movq %r14, 24(%rdx) -; FALLBACK2-NEXT: movq %r10, 8(%rdx) +; FALLBACK2-NEXT: movq %r11, 32(%rdx) +; FALLBACK2-NEXT: movq %r15, 40(%rdx) +; FALLBACK2-NEXT: movq %rdi, 16(%rdx) +; FALLBACK2-NEXT: movq %r8, 24(%rdx) +; FALLBACK2-NEXT: movq %r9, 8(%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 -; FALLBACK2-NEXT: popq %r13 ; FALLBACK2-NEXT: popq %r14 ; FALLBACK2-NEXT: popq %r15 -; FALLBACK2-NEXT: popq %rbp ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_64bytes: @@ -16509,86 +16478,81 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK6-LABEL: shl_64bytes: ; FALLBACK6: # %bb.0: -; FALLBACK6-NEXT: pushq %rbp ; FALLBACK6-NEXT: pushq %r15 ; FALLBACK6-NEXT: pushq %r14 -; FALLBACK6-NEXT: pushq %r13 ; FALLBACK6-NEXT: pushq %r12 ; FALLBACK6-NEXT: pushq %rbx -; FALLBACK6-NEXT: subq $24, %rsp +; FALLBACK6-NEXT: pushq %rax ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 ; FALLBACK6-NEXT: movups 48(%rdi), %xmm3 -; FALLBACK6-NEXT: movl (%rsi), %eax +; FALLBACK6-NEXT: movl (%rsi), %esi ; FALLBACK6-NEXT: xorps %xmm4, %xmm4 ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps 
%xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: movaps %xmm3, (%rsp) +; FALLBACK6-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; FALLBACK6-NEXT: leal (,%rax,8), %ecx -; FALLBACK6-NEXT: andl $56, %ecx +; FALLBACK6-NEXT: leal (,%rsi,8), %eax ; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: negl %eax -; FALLBACK6-NEXT: movslq %eax, %rsi -; FALLBACK6-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK6-NEXT: shlxq %rcx, %rax, %r12 -; FALLBACK6-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK6-NEXT: shlxq %rcx, %rdi, %r15 -; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r13 -; FALLBACK6-NEXT: shlxq %rcx, %r13, %r8 -; FALLBACK6-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %r11 -; FALLBACK6-NEXT: shlxq %rcx, %r11, %r10 -; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %r14 -; FALLBACK6-NEXT: shlxq %rcx, %r14, %rbx -; FALLBACK6-NEXT: movl %ecx, %r9d -; FALLBACK6-NEXT: notb %r9b +; FALLBACK6-NEXT: movl %eax, %ecx +; FALLBACK6-NEXT: andl $56, %esi +; FALLBACK6-NEXT: negl %esi +; FALLBACK6-NEXT: movslq %esi, %rsi +; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %rdi +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %r9 +; FALLBACK6-NEXT: notb %al +; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %r8 +; FALLBACK6-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK6-NEXT: shrq %r8 +; FALLBACK6-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK6-NEXT: orq %r9, %r8 +; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %r9 +; FALLBACK6-NEXT: shlxq %rcx, %r9, %r11 +; FALLBACK6-NEXT: shrq %r9 +; FALLBACK6-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK6-NEXT: orq %r10, %r9 +; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %r10 +; FALLBACK6-NEXT: shlxq %rcx, %r10, %r14 +; FALLBACK6-NEXT: shrq %r10 +; FALLBACK6-NEXT: shrxq %rax, %r10, %r10 +; FALLBACK6-NEXT: orq %r11, %r10 +; FALLBACK6-NEXT: movq -64(%rsp,%rsi), %rbx +; FALLBACK6-NEXT: movq -56(%rsp,%rsi), %r11 +; FALLBACK6-NEXT: shlxq %rcx, %r11, %r15 +; FALLBACK6-NEXT: shrq %r11 +; FALLBACK6-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK6-NEXT: orq %r14, %r11 +; FALLBACK6-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK6-NEXT: shrq %rbx +; FALLBACK6-NEXT: shrxq %rax, %rbx, %rbx +; FALLBACK6-NEXT: orq %r15, %rbx +; FALLBACK6-NEXT: movq -16(%rsp,%rsi), %r15 +; FALLBACK6-NEXT: shlxq %rcx, %r15, %r12 ; FALLBACK6-NEXT: shrq %rdi -; FALLBACK6-NEXT: shrxq %r9, %rdi, %rdi +; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK6-NEXT: orq %r12, %rdi -; FALLBACK6-NEXT: movq (%rsp,%rsi), %rbp -; FALLBACK6-NEXT: shlxq %rcx, %rbp, %r8 -; FALLBACK6-NEXT: shrq %r13 -; FALLBACK6-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK6-NEXT: orq %r15, %r12 -; FALLBACK6-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 -; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK6-NEXT: shlxq %rcx, %rsi, %rcx -; FALLBACK6-NEXT: shrq %r11 -; FALLBACK6-NEXT: shrxq %r9, %r11, %r11 -; FALLBACK6-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; FALLBACK6-NEXT: shrq %r14 -; FALLBACK6-NEXT: shrxq %r9, %r14, %r14 -; FALLBACK6-NEXT: orq %r10, %r14 -; FALLBACK6-NEXT: shrq %rsi -; FALLBACK6-NEXT: shrxq %r9, %rsi, %rsi -; FALLBACK6-NEXT: orq %rbx, %rsi -; FALLBACK6-NEXT: shrq %rax -; FALLBACK6-NEXT: shrxq %r9, %rax, %rax -; FALLBACK6-NEXT: orq %r8, %rax -; FALLBACK6-NEXT: shrq %rbp -; FALLBACK6-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK6-NEXT: orq %r15, %r8 -; FALLBACK6-NEXT: movq %rcx, (%rdx) -; FALLBACK6-NEXT: movq %r8, 56(%rdx) -; FALLBACK6-NEXT: movq %rax, 48(%rdx) -; FALLBACK6-NEXT: movq %rsi, 
8(%rdx) -; FALLBACK6-NEXT: movq %r14, 16(%rdx) -; FALLBACK6-NEXT: movq %r11, 24(%rdx) -; FALLBACK6-NEXT: movq %r12, 32(%rdx) -; FALLBACK6-NEXT: movq %rdi, 40(%rdx) -; FALLBACK6-NEXT: addq $24, %rsp +; FALLBACK6-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx +; FALLBACK6-NEXT: shrq %r15 +; FALLBACK6-NEXT: shrxq %rax, %r15, %rax +; FALLBACK6-NEXT: orq %rcx, %rax +; FALLBACK6-NEXT: movq %r14, (%rdx) +; FALLBACK6-NEXT: movq %rax, 56(%rdx) +; FALLBACK6-NEXT: movq %rdi, 48(%rdx) +; FALLBACK6-NEXT: movq %rbx, 8(%rdx) +; FALLBACK6-NEXT: movq %r11, 16(%rdx) +; FALLBACK6-NEXT: movq %r10, 24(%rdx) +; FALLBACK6-NEXT: movq %r9, 32(%rdx) +; FALLBACK6-NEXT: movq %r8, 40(%rdx) +; FALLBACK6-NEXT: addq $8, %rsp ; FALLBACK6-NEXT: popq %rbx ; FALLBACK6-NEXT: popq %r12 -; FALLBACK6-NEXT: popq %r13 ; FALLBACK6-NEXT: popq %r14 ; FALLBACK6-NEXT: popq %r15 -; FALLBACK6-NEXT: popq %rbp ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: shl_64bytes: @@ -16798,80 +16762,75 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK10-LABEL: shl_64bytes: ; FALLBACK10: # %bb.0: -; FALLBACK10-NEXT: pushq %rbp ; FALLBACK10-NEXT: pushq %r15 ; FALLBACK10-NEXT: pushq %r14 -; FALLBACK10-NEXT: pushq %r13 ; FALLBACK10-NEXT: pushq %r12 ; FALLBACK10-NEXT: pushq %rbx -; FALLBACK10-NEXT: subq $24, %rsp +; FALLBACK10-NEXT: pushq %rax ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK10-NEXT: movl (%rsi), %eax +; FALLBACK10-NEXT: movl (%rsi), %esi ; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK10-NEXT: leal (,%rax,8), %ecx -; FALLBACK10-NEXT: andl $56, %ecx +; FALLBACK10-NEXT: leal (,%rsi,8), %eax ; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: negl %eax -; FALLBACK10-NEXT: movslq %eax, %rsi -; FALLBACK10-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK10-NEXT: shlxq %rcx, %rax, %r12 -; FALLBACK10-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK10-NEXT: shlxq %rcx, %rdi, %r15 -; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r13 -; FALLBACK10-NEXT: shlxq %rcx, %r13, %r8 -; FALLBACK10-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %r11 -; FALLBACK10-NEXT: shlxq %rcx, %r11, %r10 -; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %r14 -; FALLBACK10-NEXT: shlxq %rcx, %r14, %rbx -; FALLBACK10-NEXT: movl %ecx, %r9d -; FALLBACK10-NEXT: notb %r9b +; FALLBACK10-NEXT: movl %eax, %ecx +; FALLBACK10-NEXT: andl $56, %esi +; FALLBACK10-NEXT: negl %esi +; FALLBACK10-NEXT: movslq %esi, %rsi +; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %rdi +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %r9 +; FALLBACK10-NEXT: notb %al +; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %r8 +; FALLBACK10-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK10-NEXT: shrq %r8 +; FALLBACK10-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK10-NEXT: orq %r9, %r8 +; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %r9 +; FALLBACK10-NEXT: shlxq %rcx, %r9, %r11 +; FALLBACK10-NEXT: shrq %r9 +; FALLBACK10-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK10-NEXT: orq %r10, %r9 +; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %r10 +; FALLBACK10-NEXT: shlxq %rcx, %r10, %r14 +; FALLBACK10-NEXT: shrq %r10 +; FALLBACK10-NEXT: shrxq %rax, %r10, %r10 +; FALLBACK10-NEXT: orq %r11, %r10 +; FALLBACK10-NEXT: movq -64(%rsp,%rsi), %rbx +; FALLBACK10-NEXT: movq -56(%rsp,%rsi), %r11 +; FALLBACK10-NEXT: shlxq %rcx, %r11, %r15 +; FALLBACK10-NEXT: shrq %r11 +; 
FALLBACK10-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK10-NEXT: orq %r14, %r11 +; FALLBACK10-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK10-NEXT: shrq %rbx +; FALLBACK10-NEXT: shrxq %rax, %rbx, %rbx +; FALLBACK10-NEXT: orq %r15, %rbx +; FALLBACK10-NEXT: movq -16(%rsp,%rsi), %r15 +; FALLBACK10-NEXT: shlxq %rcx, %r15, %r12 ; FALLBACK10-NEXT: shrq %rdi -; FALLBACK10-NEXT: shrxq %r9, %rdi, %rdi +; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK10-NEXT: orq %r12, %rdi -; FALLBACK10-NEXT: movq (%rsp,%rsi), %rbp -; FALLBACK10-NEXT: shlxq %rcx, %rbp, %r8 -; FALLBACK10-NEXT: shrq %r13 -; FALLBACK10-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK10-NEXT: orq %r15, %r12 -; FALLBACK10-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 -; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK10-NEXT: shlxq %rcx, %rsi, %rcx -; FALLBACK10-NEXT: shrq %r11 -; FALLBACK10-NEXT: shrxq %r9, %r11, %r11 -; FALLBACK10-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; FALLBACK10-NEXT: shrq %r14 -; FALLBACK10-NEXT: shrxq %r9, %r14, %r14 -; FALLBACK10-NEXT: orq %r10, %r14 -; FALLBACK10-NEXT: shrq %rsi -; FALLBACK10-NEXT: shrxq %r9, %rsi, %rsi -; FALLBACK10-NEXT: orq %rbx, %rsi -; FALLBACK10-NEXT: shrq %rax -; FALLBACK10-NEXT: shrxq %r9, %rax, %rax -; FALLBACK10-NEXT: orq %r8, %rax -; FALLBACK10-NEXT: shrq %rbp -; FALLBACK10-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK10-NEXT: orq %r15, %r8 -; FALLBACK10-NEXT: movq %rcx, (%rdx) -; FALLBACK10-NEXT: movq %r8, 56(%rdx) -; FALLBACK10-NEXT: movq %rax, 48(%rdx) -; FALLBACK10-NEXT: movq %rsi, 8(%rdx) -; FALLBACK10-NEXT: movq %r14, 16(%rdx) -; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %r12, 32(%rdx) -; FALLBACK10-NEXT: movq %rdi, 40(%rdx) -; FALLBACK10-NEXT: addq $24, %rsp +; FALLBACK10-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx +; FALLBACK10-NEXT: shrq %r15 +; FALLBACK10-NEXT: shrxq %rax, %r15, %rax +; FALLBACK10-NEXT: orq %rcx, %rax +; FALLBACK10-NEXT: movq %r14, (%rdx) +; FALLBACK10-NEXT: movq %rax, 56(%rdx) +; FALLBACK10-NEXT: movq %rdi, 48(%rdx) +; FALLBACK10-NEXT: movq %rbx, 8(%rdx) +; FALLBACK10-NEXT: movq %r11, 16(%rdx) +; FALLBACK10-NEXT: movq %r10, 24(%rdx) +; FALLBACK10-NEXT: movq %r9, 32(%rdx) +; FALLBACK10-NEXT: movq %r8, 40(%rdx) +; FALLBACK10-NEXT: addq $8, %rsp ; FALLBACK10-NEXT: popq %rbx ; FALLBACK10-NEXT: popq %r12 -; FALLBACK10-NEXT: popq %r13 ; FALLBACK10-NEXT: popq %r14 ; FALLBACK10-NEXT: popq %r15 -; FALLBACK10-NEXT: popq %rbp ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; @@ -17071,77 +17030,72 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; ; FALLBACK14-LABEL: shl_64bytes: ; FALLBACK14: # %bb.0: -; FALLBACK14-NEXT: pushq %rbp ; FALLBACK14-NEXT: pushq %r15 ; FALLBACK14-NEXT: pushq %r14 -; FALLBACK14-NEXT: pushq %r13 ; FALLBACK14-NEXT: pushq %r12 ; FALLBACK14-NEXT: pushq %rbx -; FALLBACK14-NEXT: subq $24, %rsp +; FALLBACK14-NEXT: pushq %rax ; FALLBACK14-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK14-NEXT: movl (%rsi), %eax +; FALLBACK14-NEXT: movl (%rsi), %esi ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK14-NEXT: leal (,%rax,8), %ecx -; FALLBACK14-NEXT: andl $56, %ecx +; FALLBACK14-NEXT: leal (,%rsi,8), %eax ; FALLBACK14-NEXT: andl $56, %eax -; FALLBACK14-NEXT: negl %eax -; FALLBACK14-NEXT: movslq %eax, %rsi -; FALLBACK14-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK14-NEXT: shlxq %rcx, %rax, %r12 -; FALLBACK14-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK14-NEXT: shlxq %rcx, %rdi, %r15 -; FALLBACK14-NEXT: 
movq -24(%rsp,%rsi), %r13 -; FALLBACK14-NEXT: shlxq %rcx, %r13, %r8 -; FALLBACK14-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %r11 -; FALLBACK14-NEXT: shlxq %rcx, %r11, %r10 -; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %r14 -; FALLBACK14-NEXT: shlxq %rcx, %r14, %rbx -; FALLBACK14-NEXT: movl %ecx, %r9d -; FALLBACK14-NEXT: notb %r9b +; FALLBACK14-NEXT: movl %eax, %ecx +; FALLBACK14-NEXT: andl $56, %esi +; FALLBACK14-NEXT: negl %esi +; FALLBACK14-NEXT: movslq %esi, %rsi +; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %rdi +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %r9 +; FALLBACK14-NEXT: notb %al +; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %r8 +; FALLBACK14-NEXT: shlxq %rcx, %r8, %r10 +; FALLBACK14-NEXT: shrq %r8 +; FALLBACK14-NEXT: shrxq %rax, %r8, %r8 +; FALLBACK14-NEXT: orq %r9, %r8 +; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %r9 +; FALLBACK14-NEXT: shlxq %rcx, %r9, %r11 +; FALLBACK14-NEXT: shrq %r9 +; FALLBACK14-NEXT: shrxq %rax, %r9, %r9 +; FALLBACK14-NEXT: orq %r10, %r9 +; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %r10 +; FALLBACK14-NEXT: shlxq %rcx, %r10, %r14 +; FALLBACK14-NEXT: shrq %r10 +; FALLBACK14-NEXT: shrxq %rax, %r10, %r10 +; FALLBACK14-NEXT: orq %r11, %r10 +; FALLBACK14-NEXT: movq -64(%rsp,%rsi), %rbx +; FALLBACK14-NEXT: movq -56(%rsp,%rsi), %r11 +; FALLBACK14-NEXT: shlxq %rcx, %r11, %r15 +; FALLBACK14-NEXT: shrq %r11 +; FALLBACK14-NEXT: shrxq %rax, %r11, %r11 +; FALLBACK14-NEXT: orq %r14, %r11 +; FALLBACK14-NEXT: shlxq %rcx, %rbx, %r14 +; FALLBACK14-NEXT: shrq %rbx +; FALLBACK14-NEXT: shrxq %rax, %rbx, %rbx +; FALLBACK14-NEXT: orq %r15, %rbx +; FALLBACK14-NEXT: movq -16(%rsp,%rsi), %r15 +; FALLBACK14-NEXT: shlxq %rcx, %r15, %r12 ; FALLBACK14-NEXT: shrq %rdi -; FALLBACK14-NEXT: shrxq %r9, %rdi, %rdi +; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK14-NEXT: orq %r12, %rdi -; FALLBACK14-NEXT: movq (%rsp,%rsi), %rbp -; FALLBACK14-NEXT: shlxq %rcx, %rbp, %r8 -; FALLBACK14-NEXT: shrq %r13 -; FALLBACK14-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK14-NEXT: orq %r15, %r12 -; FALLBACK14-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 -; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %rsi -; FALLBACK14-NEXT: shlxq %rcx, %rsi, %rcx -; FALLBACK14-NEXT: shrq %r11 -; FALLBACK14-NEXT: shrxq %r9, %r11, %r11 -; FALLBACK14-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; FALLBACK14-NEXT: shrq %r14 -; FALLBACK14-NEXT: shrxq %r9, %r14, %r14 -; FALLBACK14-NEXT: orq %r10, %r14 -; FALLBACK14-NEXT: shrq %rsi -; FALLBACK14-NEXT: shrxq %r9, %rsi, %rsi -; FALLBACK14-NEXT: orq %rbx, %rsi -; FALLBACK14-NEXT: shrq %rax -; FALLBACK14-NEXT: shrxq %r9, %rax, %rax -; FALLBACK14-NEXT: orq %r8, %rax -; FALLBACK14-NEXT: shrq %rbp -; FALLBACK14-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK14-NEXT: orq %r15, %r8 -; FALLBACK14-NEXT: movq %rcx, (%rdx) -; FALLBACK14-NEXT: movq %r8, 56(%rdx) -; FALLBACK14-NEXT: movq %rax, 48(%rdx) -; FALLBACK14-NEXT: movq %rsi, 8(%rdx) -; FALLBACK14-NEXT: movq %r14, 16(%rdx) -; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %r12, 32(%rdx) -; FALLBACK14-NEXT: movq %rdi, 40(%rdx) -; FALLBACK14-NEXT: addq $24, %rsp +; FALLBACK14-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx +; FALLBACK14-NEXT: shrq %r15 +; FALLBACK14-NEXT: shrxq %rax, %r15, %rax +; FALLBACK14-NEXT: orq %rcx, %rax +; FALLBACK14-NEXT: movq %r14, (%rdx) +; FALLBACK14-NEXT: movq %rax, 56(%rdx) +; FALLBACK14-NEXT: movq %rdi, 48(%rdx) +; FALLBACK14-NEXT: movq %rbx, 8(%rdx) +; FALLBACK14-NEXT: movq %r11, 16(%rdx) +; FALLBACK14-NEXT: movq %r10, 24(%rdx) +; FALLBACK14-NEXT: movq %r9, 32(%rdx) +; 
FALLBACK14-NEXT: movq %r8, 40(%rdx) +; FALLBACK14-NEXT: addq $8, %rsp ; FALLBACK14-NEXT: popq %rbx ; FALLBACK14-NEXT: popq %r12 -; FALLBACK14-NEXT: popq %r13 ; FALLBACK14-NEXT: popq %r14 ; FALLBACK14-NEXT: popq %r15 -; FALLBACK14-NEXT: popq %rbp ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; @@ -17681,144 +17635,149 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK18-NEXT: leal (,%ebp,8), %edx -; FALLBACK18-NEXT: andl $24, %edx +; FALLBACK18-NEXT: leal (,%ebp,8), %ebx +; FALLBACK18-NEXT: andl $24, %ebx +; FALLBACK18-NEXT: movl %ebx, %eax ; FALLBACK18-NEXT: andl $60, %ebp ; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK18-NEXT: subl %ebp, %edi -; FALLBACK18-NEXT: movl (%edi), %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 4(%edi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl %edx, %ebx +; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK18-NEXT: subl %ebp, %edx +; FALLBACK18-NEXT: movl (%edx), %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 4(%edx), %ecx ; FALLBACK18-NEXT: notb %bl -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK18-NEXT: orl %ecx, %esi +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK18-NEXT: shlxl %eax, %ecx, %esi +; FALLBACK18-NEXT: movl %eax, %ebp +; FALLBACK18-NEXT: orl %esi, %edi +; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 8(%edx), %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 8(%edi), %esi -; FALLBACK18-NEXT: movl %esi, %ecx -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK18-NEXT: movl 12(%edi), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, %esi, %esi -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: orl %esi, %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 16(%edi), %eax -; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: movl 20(%edi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK18-NEXT: movl 12(%edx), %esi +; FALLBACK18-NEXT: movl %ebp, %edi +; FALLBACK18-NEXT: shlxl %ebp, %esi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK18-NEXT: orl %eax, %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 
24(%edi), %ecx +; FALLBACK18-NEXT: movl 16(%edx), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK18-NEXT: movl 28(%edi), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK18-NEXT: movl 20(%edx), %ecx +; FALLBACK18-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %eax, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 32(%edi), %eax +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: movl 36(%edi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp +; FALLBACK18-NEXT: movl 24(%edx), %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shrl %esi +; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax +; FALLBACK18-NEXT: movl 28(%edx), %esi +; FALLBACK18-NEXT: shlxl %edi, %esi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx -; FALLBACK18-NEXT: orl %eax, %ecx -; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 40(%edi), %ecx +; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 32(%edx), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax -; FALLBACK18-NEXT: movl 44(%edi), %ecx -; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp +; FALLBACK18-NEXT: movl 36(%edx), %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %edi, %ecx, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK18-NEXT: movl %edi, %eax ; FALLBACK18-NEXT: shrl %esi ; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %eax, %esi -; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: movl 48(%edi), %esi +; FALLBACK18-NEXT: orl %ebp, %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: movl 40(%edx), %edi +; FALLBACK18-NEXT: movl %edi, %esi ; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax -; FALLBACK18-NEXT: movl 52(%edi), %esi -; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp -; FALLBACK18-NEXT: orl %ebp, %eax +; FALLBACK18-NEXT: shrxl %ebx, %esi, %ecx +; FALLBACK18-NEXT: movl 44(%edx), %esi +; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
FALLBACK18-NEXT: shlxl %eax, %esi, %ebp +; FALLBACK18-NEXT: orl %ebp, %ecx +; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %eax, %edi, %edi +; FALLBACK18-NEXT: movl %eax, %esi +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: shrl %eax +; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; FALLBACK18-NEXT: shrl %ecx -; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ebp -; FALLBACK18-NEXT: orl %eax, %ebp -; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK18-NEXT: movl 48(%edx), %ebp +; FALLBACK18-NEXT: movl %ebp, %edi +; FALLBACK18-NEXT: shrl %edi +; FALLBACK18-NEXT: shrxl %ebx, %edi, %eax +; FALLBACK18-NEXT: movl 52(%edx), %ecx +; FALLBACK18-NEXT: shlxl %esi, %ecx, %edi +; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK18-NEXT: shlxl %esi, %ebp, %edi +; FALLBACK18-NEXT: movl %esi, %ebp ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK18-NEXT: negl %eax -; FALLBACK18-NEXT: shlxl %edx, 188(%esp,%eax), %ecx -; FALLBACK18-NEXT: movl 56(%edi), %eax -; FALLBACK18-NEXT: shlxl %edx, %eax, %edx -; FALLBACK18-NEXT: shrl %esi -; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi -; FALLBACK18-NEXT: orl %edx, %esi ; FALLBACK18-NEXT: shrl %eax -; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax -; FALLBACK18-NEXT: orl %eax, %ecx -; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK18-NEXT: movl %edx, (%eax) -; FALLBACK18-NEXT: movl %esi, 56(%eax) -; FALLBACK18-NEXT: movl %ecx, 60(%eax) -; FALLBACK18-NEXT: movl %ebp, 48(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 52(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 40(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 44(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 32(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 36(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 24(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 28(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 16(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 20(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 8(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 12(%eax) -; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK18-NEXT: movl %ecx, 4(%eax) +; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi +; FALLBACK18-NEXT: orl %edi, %esi +; FALLBACK18-NEXT: movl 56(%edx), %edi +; FALLBACK18-NEXT: shrl %ecx +; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax +; FALLBACK18-NEXT: shlxl %ebp, %edi, %ecx +; FALLBACK18-NEXT: orl %ecx, %eax +; FALLBACK18-NEXT: shrl %edi +; 
FALLBACK18-NEXT: shrxl %ebx, %edi, %ecx +; FALLBACK18-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK18-NEXT: negl %ebx +; FALLBACK18-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; FALLBACK18-NEXT: orl %ecx, %ebx +; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %edx +; FALLBACK18-NEXT: movl %edi, (%edx) +; FALLBACK18-NEXT: movl %eax, 56(%edx) +; FALLBACK18-NEXT: movl %ebx, 60(%edx) +; FALLBACK18-NEXT: movl %esi, 48(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 52(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 40(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 44(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 32(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 36(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 24(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 28(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 16(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 20(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 8(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 12(%edx) +; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK18-NEXT: movl %eax, 4(%edx) ; FALLBACK18-NEXT: addl $204, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi @@ -18342,144 +18301,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: leal (,%eax,8), %edx -; FALLBACK22-NEXT: andl $24, %edx +; FALLBACK22-NEXT: leal (,%eax,8), %ebx +; FALLBACK22-NEXT: andl $24, %ebx +; FALLBACK22-NEXT: movl %ebx, %ecx ; FALLBACK22-NEXT: andl $60, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: leal {{[0-9]+}}(%esp), %edi -; FALLBACK22-NEXT: subl %eax, %edi -; FALLBACK22-NEXT: movl (%edi), %ecx -; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 4(%edi), %eax +; FALLBACK22-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK22-NEXT: subl %eax, %edx +; FALLBACK22-NEXT: movl (%edx), %esi +; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 4(%edx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl %edx, %ebx ; FALLBACK22-NEXT: notb %bl -; FALLBACK22-NEXT: shrl %ecx -; FALLBACK22-NEXT: shrxl %ebx, %ecx, %esi -; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx -; FALLBACK22-NEXT: orl %ecx, %esi +; FALLBACK22-NEXT: shrl %esi +; FALLBACK22-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK22-NEXT: shlxl %ecx, %eax, %esi +; FALLBACK22-NEXT: orl %esi, %edi +; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 8(%edx), %esi ; FALLBACK22-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 8(%edi), %esi
-; FALLBACK22-NEXT:    movl %esi, %ecx
-; FALLBACK22-NEXT:    shrl %ecx
-; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK22-NEXT:    movl 12(%edi), %ecx
-; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %edx, %esi, %esi
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT:    shrl %eax
-; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT:    orl %esi, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 16(%edi), %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrl %eax
-; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT:    movl 20(%edi), %esi
-; FALLBACK22-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK22-NEXT:    shrl %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK22-NEXT:    movl 12(%edx), %esi
+; FALLBACK22-NEXT:    shlxl %ecx, %esi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl %ecx, %edi
+; FALLBACK22-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK22-NEXT:    shrl %ecx
 ; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
 ; FALLBACK22-NEXT:    orl %eax, %ecx
 ; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 24(%edi), %ecx
+; FALLBACK22-NEXT:    movl 16(%edx), %ecx
 ; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    shrl %ecx
 ; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK22-NEXT:    movl 28(%edi), %ecx
-; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK22-NEXT:    movl 20(%edx), %ecx
+; FALLBACK22-NEXT:    shlxl %edi, %ecx, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    shrl %esi
-; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK22-NEXT:    orl %eax, %esi
-; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 32(%edi), %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrl %eax
-; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT:    movl 36(%edi), %esi
-; FALLBACK22-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK22-NEXT:    movl 24(%edx), %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrl %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK22-NEXT:    movl 28(%edx), %esi
+; FALLBACK22-NEXT:    shlxl %edi, %esi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    shrl %ecx
-; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT:    orl %eax, %ecx
-; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 40(%edi), %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 32(%edx), %ecx
 ; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    shrl %ecx
 ; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK22-NEXT:    movl 44(%edi), %ecx
-; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK22-NEXT:    movl 36(%edx), %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %edi, %ecx, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl %edi, %eax
 ; FALLBACK22-NEXT:    shrl %esi
 ; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK22-NEXT:    orl %eax, %esi
-; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 48(%edi), %esi
+; FALLBACK22-NEXT:    orl %ebp, %esi
 ; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 40(%edx), %edi
+; FALLBACK22-NEXT:    movl %edi, %esi
 ; FALLBACK22-NEXT:    shrl %esi
-; FALLBACK22-NEXT:    shrxl %ebx, %esi, %eax
-; FALLBACK22-NEXT:    movl 52(%edi), %esi
-; FALLBACK22-NEXT:    shlxl %edx, %esi, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %ecx
+; FALLBACK22-NEXT:    movl 44(%edx), %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %eax, %esi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %eax, %edi, %edi
+; FALLBACK22-NEXT:    movl %eax, %esi
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    shrl %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT:    orl %edi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT:    shrl %ecx
-; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ebp
-; FALLBACK22-NEXT:    orl %eax, %ebp
-; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl 48(%edx), %ebp
+; FALLBACK22-NEXT:    movl %ebp, %edi
+; FALLBACK22-NEXT:    shrl %edi
+; FALLBACK22-NEXT:    shrxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    movl 52(%edx), %ecx
+; FALLBACK22-NEXT:    shlxl %esi, %ecx, %edi
+; FALLBACK22-NEXT:    orl %edi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %esi, %ebp, %edi
+; FALLBACK22-NEXT:    movl %esi, %ebp
 ; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT:    negl %eax
-; FALLBACK22-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
-; FALLBACK22-NEXT:    movl 56(%edi), %eax
-; FALLBACK22-NEXT:    shlxl %edx, %eax, %edx
-; FALLBACK22-NEXT:    shrl %esi
-; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK22-NEXT:    orl %edx, %esi
 ; FALLBACK22-NEXT:    shrl %eax
-; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT:    orl %eax, %ecx
-; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %edx, (%eax)
-; FALLBACK22-NEXT:    movl %esi, 56(%eax)
-; FALLBACK22-NEXT:    movl %ecx, 60(%eax)
-; FALLBACK22-NEXT:    movl %ebp, 48(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 52(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 40(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 44(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 32(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 36(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 28(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK22-NEXT:    shrxl %ebx, %eax, %esi
+; FALLBACK22-NEXT:    orl %edi, %esi
+; FALLBACK22-NEXT:    movl 56(%edx), %edi
+; FALLBACK22-NEXT:    shrl %ecx
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT:    shlxl %ebp, %edi, %ecx
+; FALLBACK22-NEXT:    orl %ecx, %eax
+; FALLBACK22-NEXT:    shrl %edi
+; FALLBACK22-NEXT:    shrxl %ebx, %edi, %ecx
+; FALLBACK22-NEXT:    shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK22-NEXT:    negl %ebx
+; FALLBACK22-NEXT:    shlxl %ebp, 188(%esp,%ebx), %ebx
+; FALLBACK22-NEXT:    orl %ecx, %ebx
+; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK22-NEXT:    movl %edi, (%edx)
+; FALLBACK22-NEXT:    movl %eax, 56(%edx)
+; FALLBACK22-NEXT:    movl %ebx, 60(%edx)
+; FALLBACK22-NEXT:    movl %esi, 48(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 52(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 40(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 44(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 32(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 36(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 24(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 28(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 16(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 20(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 8(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 12(%edx)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl %eax, 4(%edx)
 ; FALLBACK22-NEXT:    addl $204, %esp
 ; FALLBACK22-NEXT:    popl %esi
 ; FALLBACK22-NEXT:    popl %edi
@@ -18943,144 +18908,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT:    leal (,%eax,8), %edx
-; FALLBACK26-NEXT:    andl $24, %edx
+; FALLBACK26-NEXT:    leal (,%eax,8), %ebx
+; FALLBACK26-NEXT:    andl $24, %ebx
+; FALLBACK26-NEXT:    movl %ebx, %ecx
 ; FALLBACK26-NEXT:    andl $60, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    leal {{[0-9]+}}(%esp), %edi
-; FALLBACK26-NEXT:    subl %eax, %edi
-; FALLBACK26-NEXT:    movl (%edi), %ecx
-; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 4(%edi), %eax
+; FALLBACK26-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; FALLBACK26-NEXT:    subl %eax, %edx
+; FALLBACK26-NEXT:    movl (%edx), %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 4(%edx), %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl %edx, %ebx
 ; FALLBACK26-NEXT:    notb %bl
-; FALLBACK26-NEXT:    shrl %ecx
-; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %esi
-; FALLBACK26-NEXT:    shlxl %edx, %eax, %ecx
-; FALLBACK26-NEXT:    orl %ecx, %esi
+; FALLBACK26-NEXT:    shrl %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %edi
+; FALLBACK26-NEXT:    shlxl %ecx, %eax, %esi
+; FALLBACK26-NEXT:    orl %esi, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 8(%edx), %esi
 ; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 8(%edi), %esi
-; FALLBACK26-NEXT:    movl %esi, %ecx
-; FALLBACK26-NEXT:    shrl %ecx
-; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK26-NEXT:    movl 12(%edi), %ecx
-; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %edx, %esi, %esi
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT:    shrl %eax
-; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK26-NEXT:    orl %esi, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 16(%edi), %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrl %eax
-; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK26-NEXT:    movl 20(%edi), %esi
-; FALLBACK26-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT:    shrl %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    movl 12(%edx), %esi
+; FALLBACK26-NEXT:    shlxl %ecx, %esi, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl %ecx, %edi
+; FALLBACK26-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK26-NEXT:    shrl %ecx
 ; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
 ; FALLBACK26-NEXT:    orl %eax, %ecx
 ; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 24(%edi), %ecx
+; FALLBACK26-NEXT:    movl 16(%edx), %ecx
 ; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    shrl %ecx
 ; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK26-NEXT:    movl 28(%edi), %ecx
-; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK26-NEXT:    movl 20(%edx), %ecx
+; FALLBACK26-NEXT:    shlxl %edi, %ecx, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    shrl %esi
-; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK26-NEXT:    orl %eax, %esi
-; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 32(%edi), %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrl %eax
-; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK26-NEXT:    movl 36(%edi), %esi
-; FALLBACK26-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK26-NEXT:    movl 24(%edx), %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrl %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    movl 28(%edx), %esi
+; FALLBACK26-NEXT:    shlxl %edi, %esi, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    shrl %ecx
-; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK26-NEXT:    orl %eax, %ecx
-; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 40(%edi), %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 32(%edx), %ecx
 ; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    shrl %ecx
 ; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK26-NEXT:    movl 44(%edi), %ecx
-; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK26-NEXT:    movl 36(%edx), %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %edi, %ecx, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl %edi, %eax
 ; FALLBACK26-NEXT:    shrl %esi
 ; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK26-NEXT:    orl %eax, %esi
-; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 48(%edi), %esi
+; FALLBACK26-NEXT:    orl %ebp, %esi
 ; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 40(%edx), %edi
+; FALLBACK26-NEXT:    movl %edi, %esi
 ; FALLBACK26-NEXT:    shrl %esi
-; FALLBACK26-NEXT:    shrxl %ebx, %esi, %eax
-; FALLBACK26-NEXT:    movl 52(%edi), %esi
-; FALLBACK26-NEXT:    shlxl %edx, %esi, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %ecx
+; FALLBACK26-NEXT:    movl 44(%edx), %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %eax, %esi, %ebp
+; FALLBACK26-NEXT:    orl %ebp, %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %eax, %edi, %edi
+; FALLBACK26-NEXT:    movl %eax, %esi
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    shrl %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT:    orl %edi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK26-NEXT:    shrl %ecx
-; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ebp
-; FALLBACK26-NEXT:    orl %eax, %ebp
-; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl 48(%edx), %ebp
+; FALLBACK26-NEXT:    movl %ebp, %edi
+; FALLBACK26-NEXT:    shrl %edi
+; FALLBACK26-NEXT:    shrxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    movl 52(%edx), %ecx
+; FALLBACK26-NEXT:    shlxl %esi, %ecx, %edi
+; FALLBACK26-NEXT:    orl %edi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %esi, %ebp, %edi
+; FALLBACK26-NEXT:    movl %esi, %ebp
 ; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT:    negl %eax
-; FALLBACK26-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
-; FALLBACK26-NEXT:    movl 56(%edi), %eax
-; FALLBACK26-NEXT:    shlxl %edx, %eax, %edx
-; FALLBACK26-NEXT:    shrl %esi
-; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK26-NEXT:    orl %edx, %esi
 ; FALLBACK26-NEXT:    shrl %eax
-; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK26-NEXT:    orl %eax, %ecx
-; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %edx, (%eax)
-; FALLBACK26-NEXT:    movl %esi, 56(%eax)
-; FALLBACK26-NEXT:    movl %ecx, 60(%eax)
-; FALLBACK26-NEXT:    movl %ebp, 48(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 52(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 40(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 44(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 32(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 36(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 28(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK26-NEXT:    shrxl %ebx, %eax, %esi
+; FALLBACK26-NEXT:    orl %edi, %esi
+; FALLBACK26-NEXT:    movl 56(%edx), %edi
+; FALLBACK26-NEXT:    shrl %ecx
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT:    shlxl %ebp, %edi, %ecx
+; FALLBACK26-NEXT:    orl %ecx, %eax
+; FALLBACK26-NEXT:    shrl %edi
+; FALLBACK26-NEXT:    shrxl %ebx, %edi, %ecx
+; FALLBACK26-NEXT:    shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK26-NEXT:    negl %ebx
+; FALLBACK26-NEXT:    shlxl %ebp, 188(%esp,%ebx), %ebx
+; FALLBACK26-NEXT:    orl %ecx, %ebx
+; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK26-NEXT:    movl %edi, (%edx)
+; FALLBACK26-NEXT:    movl %eax, 56(%edx)
+; FALLBACK26-NEXT:    movl %ebx, 60(%edx)
+; FALLBACK26-NEXT:    movl %esi, 48(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 52(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 40(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 44(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 32(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 36(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 24(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 28(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 16(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 20(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 8(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 12(%edx)
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl %eax, 4(%edx)
 ; FALLBACK26-NEXT:    addl $204, %esp
 ; FALLBACK26-NEXT:    popl %esi
 ; FALLBACK26-NEXT:    popl %edi
@@ -19531,144 +19502,150 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK30-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
 ; FALLBACK30-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT:    leal (,%eax,8), %edx
-; FALLBACK30-NEXT:    andl $24, %edx
+; FALLBACK30-NEXT:    leal (,%eax,8), %ebx
+; FALLBACK30-NEXT:    andl $24, %ebx
+; FALLBACK30-NEXT:    movl %ebx, %ecx
 ; FALLBACK30-NEXT:    andl $60, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    leal {{[0-9]+}}(%esp), %edi
-; FALLBACK30-NEXT:    subl %eax, %edi
-; FALLBACK30-NEXT:    movl (%edi), %ecx
-; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 4(%edi), %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl %edx, %ebx
-; FALLBACK30-NEXT:    notb %bl
-; FALLBACK30-NEXT:    shrl %ecx
-; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %esi
-; FALLBACK30-NEXT:    shlxl %edx, %eax, %ecx
-; FALLBACK30-NEXT:    orl %ecx, %esi
+; FALLBACK30-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; FALLBACK30-NEXT:    subl %eax, %edx
+; FALLBACK30-NEXT:    movl (%edx), %esi
 ; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 8(%edi), %esi
-; FALLBACK30-NEXT:    movl %esi, %ecx
-; FALLBACK30-NEXT:    shrl %ecx
-; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK30-NEXT:    movl 12(%edi), %ecx
-; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %edx, %esi, %esi
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT:    shrl %eax
-; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK30-NEXT:    orl %esi, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 16(%edi), %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrl %eax
-; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK30-NEXT:    movl 20(%edi), %esi
-; FALLBACK30-NEXT:    shlxl %edx, %esi, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK30-NEXT:    shrl %ecx
-; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK30-NEXT:    orl %eax, %ecx
-; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 24(%edi), %ecx
-; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrl %ecx
-; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK30-NEXT:    movl 28(%edi), %ecx
-; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl 4(%edx), %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    notb %bl
 ; FALLBACK30-NEXT:    shrl %esi
-; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK30-NEXT:    orl %eax, %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %edi
+; FALLBACK30-NEXT:    shlxl %ecx, %eax, %esi
+; FALLBACK30-NEXT:    orl %esi, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 8(%edx), %esi
 ; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 32(%edi), %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrl %eax
-; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK30-NEXT:    movl 36(%edi), %esi
-; FALLBACK30-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT:    shrl %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK30-NEXT:    movl 12(%edx), %esi
+; FALLBACK30-NEXT:    shlxl %ecx, %esi, %ebp
 ; FALLBACK30-NEXT:    orl %ebp, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl %ecx, %edi
+; FALLBACK30-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK30-NEXT:    shrl %ecx
 ; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
 ; FALLBACK30-NEXT:    orl %eax, %ecx
 ; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 40(%edi), %ecx
+; FALLBACK30-NEXT:    movl 16(%edx), %ecx
 ; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    shrl %ecx
 ; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
-; FALLBACK30-NEXT:    movl 44(%edi), %ecx
-; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ebp
+; FALLBACK30-NEXT:    movl 20(%edx), %ecx
+; FALLBACK30-NEXT:    shlxl %edi, %ecx, %ebp
 ; FALLBACK30-NEXT:    orl %ebp, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK30-NEXT:    shrl %esi
-; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK30-NEXT:    orl %eax, %esi
-; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 48(%edi), %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %eax
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 24(%edx), %esi
 ; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    shrl %esi
 ; FALLBACK30-NEXT:    shrxl %ebx, %esi, %eax
-; FALLBACK30-NEXT:    movl 52(%edi), %esi
-; FALLBACK30-NEXT:    shlxl %edx, %esi, %ebp
+; FALLBACK30-NEXT:    movl 28(%edx), %esi
+; FALLBACK30-NEXT:    shlxl %edi, %esi, %ebp
 ; FALLBACK30-NEXT:    orl %ebp, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK30-NEXT:    shrl %ecx
-; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ebp
-; FALLBACK30-NEXT:    orl %eax, %ebp
-; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT:    orl %ebp, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT:    negl %eax
-; FALLBACK30-NEXT:    shlxl %edx, 188(%esp,%eax), %ecx
-; FALLBACK30-NEXT:    movl 56(%edi), %eax
-; FALLBACK30-NEXT:    shlxl %edx, %eax, %edx
+; FALLBACK30-NEXT:    movl 32(%edx), %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT:    movl 36(%edx), %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %edi, %ecx, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl %edi, %eax
 ; FALLBACK30-NEXT:    shrl %esi
 ; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK30-NEXT:    orl %edx, %esi
+; FALLBACK30-NEXT:    orl %ebp, %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 40(%edx), %edi
+; FALLBACK30-NEXT:    movl %edi, %esi
+; FALLBACK30-NEXT:    shrl %esi
+; FALLBACK30-NEXT:    shrxl %ebx, %esi, %ecx
+; FALLBACK30-NEXT:    movl 44(%edx), %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %eax, %esi, %ebp
+; FALLBACK30-NEXT:    orl %ebp, %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %eax, %edi, %edi
+; FALLBACK30-NEXT:    movl %eax, %esi
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK30-NEXT:    shrl %eax
 ; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK30-NEXT:    orl %eax, %ecx
-; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %edx, (%eax)
-; FALLBACK30-NEXT:    movl %esi, 56(%eax)
-; FALLBACK30-NEXT:    movl %ecx, 60(%eax)
-; FALLBACK30-NEXT:    movl %ebp, 48(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 52(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 40(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 44(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 32(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 36(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 28(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK30-NEXT:    orl %edi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 48(%edx), %ebp
+; FALLBACK30-NEXT:    movl %ebp, %edi
+; FALLBACK30-NEXT:    shrl %edi
+; FALLBACK30-NEXT:    shrxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    movl 52(%edx), %ecx
+; FALLBACK30-NEXT:    shlxl %esi, %ecx, %edi
+; FALLBACK30-NEXT:    orl %edi, %eax
+; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %esi, %ebp, %edi
+; FALLBACK30-NEXT:    movl %esi, %ebp
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    shrl %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %eax, %esi
+; FALLBACK30-NEXT:    orl %edi, %esi
+; FALLBACK30-NEXT:    movl 56(%edx), %edi
+; FALLBACK30-NEXT:    shrl %ecx
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT:    shlxl %ebp, %edi, %ecx
+; FALLBACK30-NEXT:    orl %ecx, %eax
+; FALLBACK30-NEXT:    shrl %edi
+; FALLBACK30-NEXT:    shrxl %ebx, %edi, %ecx
+; FALLBACK30-NEXT:    shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK30-NEXT:    negl %ebx
+; FALLBACK30-NEXT:    shlxl %ebp, 188(%esp,%ebx), %ebx
+; FALLBACK30-NEXT:    orl %ecx, %ebx
+; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK30-NEXT:    movl %edi, (%edx)
+; FALLBACK30-NEXT:    movl %eax, 56(%edx)
+; FALLBACK30-NEXT:    movl %ebx, 60(%edx)
+; FALLBACK30-NEXT:    movl %esi, 48(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 52(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 40(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 44(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 32(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 36(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 24(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 28(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 16(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 20(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 8(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 12(%edx)
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl %eax, 4(%edx)
 ; FALLBACK30-NEXT:    addl $204, %esp
 ; FALLBACK30-NEXT:    popl %esi
 ; FALLBACK30-NEXT:    popl %edi
@@ -20336,10 +20313,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; FALLBACK2-LABEL: ashr_64bytes:
 ; FALLBACK2:       # %bb.0:
-; FALLBACK2-NEXT:    pushq %rbp
 ; FALLBACK2-NEXT:    pushq %r15
 ; FALLBACK2-NEXT:    pushq %r14
-; FALLBACK2-NEXT:    pushq %r13
 ; FALLBACK2-NEXT:    pushq %r12
 ; FALLBACK2-NEXT:    pushq %rbx
 ; FALLBACK2-NEXT:    pushq %rax
@@ -20371,60 +20346,58 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    leal (,%rax,8), %ecx
 ; FALLBACK2-NEXT:    andl $56, %ecx
+; FALLBACK2-NEXT:    movl %ecx, %esi
 ; FALLBACK2-NEXT:    andl $56, %eax
-; FALLBACK2-NEXT:    movq -120(%rsp,%rax), %rdi
-; FALLBACK2-NEXT:    movq -112(%rsp,%rax), %r9
-; FALLBACK2-NEXT:    shrxq %rcx, %rdi, %rbx
-; FALLBACK2-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r13
-; FALLBACK2-NEXT:    movq -104(%rsp,%rax), %rsi
-; FALLBACK2-NEXT:    shrxq %rcx, %rsi, %r8
-; FALLBACK2-NEXT:    movq -96(%rsp,%rax), %r10
-; FALLBACK2-NEXT:    shrxq %rcx, %r9, %r11
-; FALLBACK2-NEXT:    movq -88(%rsp,%rax), %r14
-; FALLBACK2-NEXT:    shrxq %rcx, %r14, %r15
-; FALLBACK2-NEXT:    shrxq %rcx, %r10, %rbp
-; FALLBACK2-NEXT:    movl %ecx, %r12d
-; FALLBACK2-NEXT:    notb %r12b
-; FALLBACK2-NEXT:    addq %r9, %r9
-; FALLBACK2-NEXT:    shlxq %r12, %r9, %r9
+; FALLBACK2-NEXT:    movq -120(%rsp,%rax), %r8
+; FALLBACK2-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK2-NEXT:    shrxq %rsi, %r8, %r9
+; FALLBACK2-NEXT:    notb %cl
+; FALLBACK2-NEXT:    leaq (%r10,%r10), %rdi
+; FALLBACK2-NEXT:    shlxq %rcx, %rdi, %rdi
+; FALLBACK2-NEXT:    orq %r9, %rdi
+; FALLBACK2-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r9
+; FALLBACK2-NEXT:    addq %r8, %r8
+; FALLBACK2-NEXT:    shlxq %rcx, %r8, %r8
+; FALLBACK2-NEXT:    orq %r9, %r8
+; FALLBACK2-NEXT:    movq -104(%rsp,%rax), %r11
+; FALLBACK2-NEXT:    shrxq %rsi, %r11, %rbx
+; FALLBACK2-NEXT:    movq -96(%rsp,%rax), %r14
+; FALLBACK2-NEXT:    leaq (%r14,%r14), %r9
+; FALLBACK2-NEXT:    shlxq %rcx, %r9, %r9
 ; FALLBACK2-NEXT:    orq %rbx, %r9
-; FALLBACK2-NEXT:    addq %rdi, %rdi
-; FALLBACK2-NEXT:    shlxq %r12, %rdi, %rdi
-; FALLBACK2-NEXT:    orq %r13, %rdi
-; FALLBACK2-NEXT:    movq -80(%rsp,%rax), %rbx
-; FALLBACK2-NEXT:    shrxq %rcx, %rbx, %r13
-; FALLBACK2-NEXT:    movq -72(%rsp,%rax), %rax
-; FALLBACK2-NEXT:    sarxq %rcx, %rax, %rcx
+; FALLBACK2-NEXT:    shrxq %rsi, %r10, %r10
+; FALLBACK2-NEXT:    addq %r11, %r11
+; FALLBACK2-NEXT:    shlxq %rcx, %r11, %r11
+; FALLBACK2-NEXT:    orq %r10, %r11
+; FALLBACK2-NEXT:    movq -88(%rsp,%rax), %r10
+; FALLBACK2-NEXT:    shrxq %rsi, %r10, %rbx
+; FALLBACK2-NEXT:    movq -80(%rsp,%rax), %r15
+; FALLBACK2-NEXT:    leaq (%r15,%r15), %r12
+; FALLBACK2-NEXT:    shlxq %rcx, %r12, %r12
+; FALLBACK2-NEXT:    orq %rbx, %r12
+; FALLBACK2-NEXT:    shrxq %rsi, %r14, %rbx
 ; FALLBACK2-NEXT:    addq %r10, %r10
-; FALLBACK2-NEXT:    shlxq %r12, %r10, %r10
-; FALLBACK2-NEXT:    orq %r8, %r10
-; FALLBACK2-NEXT:    addq %rsi, %rsi
-; FALLBACK2-NEXT:    shlxq %r12, %rsi, %rsi
-; FALLBACK2-NEXT:    orq %r11, %rsi
-; FALLBACK2-NEXT:    leaq (%rbx,%rbx), %r8
-; FALLBACK2-NEXT:    shlxq %r12, %r8, %r8
-; FALLBACK2-NEXT:    orq %r15, %r8
-; FALLBACK2-NEXT:    addq %r14, %r14
-; FALLBACK2-NEXT:    shlxq %r12, %r14, %r11
-; FALLBACK2-NEXT:    orq %rbp, %r11
-; FALLBACK2-NEXT:    addq %rax, %rax
-; FALLBACK2-NEXT:    shlxq %r12, %rax, %rax
-; FALLBACK2-NEXT:    orq %r13, %rax
-; FALLBACK2-NEXT:    movq %rcx, 56(%rdx)
-; FALLBACK2-NEXT:    movq %rax, 48(%rdx)
-; FALLBACK2-NEXT:    movq %r11, 32(%rdx)
-; FALLBACK2-NEXT:    movq %r8, 40(%rdx)
-; FALLBACK2-NEXT:    movq %rsi, 16(%rdx)
-; FALLBACK2-NEXT:    movq %r10, 24(%rdx)
-; FALLBACK2-NEXT:    movq %rdi, (%rdx)
-; FALLBACK2-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK2-NEXT:    shlxq %rcx, %r10, %r10
+; FALLBACK2-NEXT:    orq %rbx, %r10
+; FALLBACK2-NEXT:    shrxq %rsi, %r15, %rbx
+; FALLBACK2-NEXT:    movq -72(%rsp,%rax), %rax
+; FALLBACK2-NEXT:    leaq (%rax,%rax), %r14
+; FALLBACK2-NEXT:    shlxq %rcx, %r14, %rcx
+; FALLBACK2-NEXT:    orq %rbx, %rcx
+; FALLBACK2-NEXT:    sarxq %rsi, %rax, %rax
+; FALLBACK2-NEXT:    movq %rax, 56(%rdx)
+; FALLBACK2-NEXT:    movq %rcx, 48(%rdx)
+; FALLBACK2-NEXT:    movq %r10, 32(%rdx)
+; FALLBACK2-NEXT:    movq %r12, 40(%rdx)
+; FALLBACK2-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK2-NEXT:    movq %r9, 24(%rdx)
+; FALLBACK2-NEXT:    movq %r8, (%rdx)
+; FALLBACK2-NEXT:    movq %rdi, 8(%rdx)
 ; FALLBACK2-NEXT:    addq $8, %rsp
 ; FALLBACK2-NEXT:    popq %rbx
 ; FALLBACK2-NEXT:    popq %r12
-; FALLBACK2-NEXT:    popq %r13
 ; FALLBACK2-NEXT:    popq %r14
 ; FALLBACK2-NEXT:    popq %r15
-; FALLBACK2-NEXT:    popq %rbp
 ; FALLBACK2-NEXT:    retq
 ;
 ; FALLBACK3-LABEL: ashr_64bytes:
@@ -20664,13 +20637,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; FALLBACK6-LABEL: ashr_64bytes:
 ; FALLBACK6:       # %bb.0:
-; FALLBACK6-NEXT:    pushq %rbp
 ; FALLBACK6-NEXT:    pushq %r15
 ; FALLBACK6-NEXT:    pushq %r14
 ; FALLBACK6-NEXT:    pushq %r13
 ; FALLBACK6-NEXT:    pushq %r12
 ; FALLBACK6-NEXT:    pushq %rbx
-; FALLBACK6-NEXT:    pushq %rax
 ; FALLBACK6-NEXT:    movups (%rdi), %xmm0
 ; FALLBACK6-NEXT:    movups 16(%rdi), %xmm1
 ; FALLBACK6-NEXT:    movups 32(%rdi), %xmm2
@@ -20691,62 +20662,60 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT:    leal (,%rax,8), %esi
-; FALLBACK6-NEXT:    andl $56, %esi
+; FALLBACK6-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK6-NEXT:    andl $56, %ecx
+; FALLBACK6-NEXT:    movl %ecx, %esi
 ; FALLBACK6-NEXT:    andl $56, %eax
-; FALLBACK6-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
-; FALLBACK6-NEXT:    movq -112(%rsp,%rax), %rcx
-; FALLBACK6-NEXT:    movq -104(%rsp,%rax), %rdi
-; FALLBACK6-NEXT:    shrxq %rsi, %rdi, %r12
-; FALLBACK6-NEXT:    movq -96(%rsp,%rax), %r13
-; FALLBACK6-NEXT:    shrxq %rsi, %rcx, %r9
-; FALLBACK6-NEXT:    movq -88(%rsp,%rax), %r10
-; FALLBACK6-NEXT:    shrxq %rsi, %r10, %r14
-; FALLBACK6-NEXT:    shrxq %rsi, %r13, %r15
-; FALLBACK6-NEXT:    movl %esi, %ebx
-; FALLBACK6-NEXT:    notb %bl
-; FALLBACK6-NEXT:    movq -120(%rsp,%rax), %rbp
-; FALLBACK6-NEXT:    leaq (%rbp,%rbp), %r8
-; FALLBACK6-NEXT:    shlxq %rbx, %r8, %r8
-; FALLBACK6-NEXT:    orq %r11, %r8
-; FALLBACK6-NEXT:    leaq (%r13,%r13), %r11
-; FALLBACK6-NEXT:    shlxq %rbx, %r11, %r11
-; FALLBACK6-NEXT:    orq %r12, %r11
+; FALLBACK6-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r8
+; FALLBACK6-NEXT:    notb %cl
+; FALLBACK6-NEXT:    movq -120(%rsp,%rax), %r10
+; FALLBACK6-NEXT:    movq -112(%rsp,%rax), %r9
+; FALLBACK6-NEXT:    leaq (%r10,%r10), %rdi
+; FALLBACK6-NEXT:    shlxq %rcx, %rdi, %rdi
+; FALLBACK6-NEXT:    orq %r8, %rdi
+; FALLBACK6-NEXT:    movq -104(%rsp,%rax), %r11
+; FALLBACK6-NEXT:    shrxq %rsi, %r11, %rbx
+; FALLBACK6-NEXT:    movq -96(%rsp,%rax), %r14
+; FALLBACK6-NEXT:    leaq (%r14,%r14), %r8
+; FALLBACK6-NEXT:    shlxq %rcx, %r8, %r8
+; FALLBACK6-NEXT:    orq %rbx, %r8
+; FALLBACK6-NEXT:    shrxq %rsi, %r9, %rbx
+; FALLBACK6-NEXT:    addq %r11, %r11
+; FALLBACK6-NEXT:    shlxq %rcx, %r11, %r11
+; FALLBACK6-NEXT:    orq %rbx, %r11
+; FALLBACK6-NEXT:    movq -88(%rsp,%rax), %rbx
+; FALLBACK6-NEXT:    shrxq %rsi, %rbx, %r15
 ; FALLBACK6-NEXT:    movq -80(%rsp,%rax), %r12
-; FALLBACK6-NEXT:    shrxq %rsi, %r12, %r13
-; FALLBACK6-NEXT:    shrxq %rsi, %rbp, %rbp
+; FALLBACK6-NEXT:    leaq (%r12,%r12), %r13
+; FALLBACK6-NEXT:    shlxq %rcx, %r13, %r13
+; FALLBACK6-NEXT:    orq %r15, %r13
+; FALLBACK6-NEXT:    shrxq %rsi, %r14, %r14
+; FALLBACK6-NEXT:    addq %rbx, %rbx
+; FALLBACK6-NEXT:    shlxq %rcx, %rbx, %rbx
+; FALLBACK6-NEXT:    orq %r14, %rbx
+; FALLBACK6-NEXT:    shrxq %rsi, %r12, %r14
 ; FALLBACK6-NEXT:    movq -72(%rsp,%rax), %rax
-; FALLBACK6-NEXT:    sarxq %rsi, %rax, %rsi
-; FALLBACK6-NEXT:    addq %rdi, %rdi
-; FALLBACK6-NEXT:    shlxq %rbx, %rdi, %rdi
-; FALLBACK6-NEXT:    orq %r9, %rdi
-; FALLBACK6-NEXT:    leaq (%r12,%r12), %r9
-; FALLBACK6-NEXT:    shlxq %rbx, %r9, %r9
-; FALLBACK6-NEXT:    orq %r14, %r9
-; FALLBACK6-NEXT:    addq %r10, %r10
-; FALLBACK6-NEXT:    shlxq %rbx, %r10, %r10
-; FALLBACK6-NEXT:    orq %r15, %r10
-; FALLBACK6-NEXT:    addq %rax, %rax
-; FALLBACK6-NEXT:    shlxq %rbx, %rax, %rax
-; FALLBACK6-NEXT:    orq %r13, %rax
-; FALLBACK6-NEXT:    addq %rcx, %rcx
-; FALLBACK6-NEXT:    shlxq %rbx, %rcx, %rcx
-; FALLBACK6-NEXT:    orq %rbp, %rcx
-; FALLBACK6-NEXT:    movq %rsi, 56(%rdx)
+; FALLBACK6-NEXT:    leaq (%rax,%rax), %r15
+; FALLBACK6-NEXT:    shlxq %rcx, %r15, %r15
+; FALLBACK6-NEXT:    orq %r14, %r15
+; FALLBACK6-NEXT:    shrxq %rsi, %r10, %r10
+; FALLBACK6-NEXT:    addq %r9, %r9
+; FALLBACK6-NEXT:    shlxq %rcx, %r9, %rcx
+; FALLBACK6-NEXT:    orq %r10, %rcx
+; FALLBACK6-NEXT:    sarxq %rsi, %rax, %rax
+; FALLBACK6-NEXT:    movq %rax, 56(%rdx)
 ; FALLBACK6-NEXT:    movq %rcx, 8(%rdx)
-; FALLBACK6-NEXT:    movq %rax, 48(%rdx)
-; FALLBACK6-NEXT:    movq %r10, 32(%rdx)
-; FALLBACK6-NEXT:    movq %r9, 40(%rdx)
-; FALLBACK6-NEXT:    movq %rdi, 16(%rdx)
-; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
-; FALLBACK6-NEXT:    movq %r8, (%rdx)
-; FALLBACK6-NEXT:    addq $8, %rsp
+; FALLBACK6-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK6-NEXT:    movq %rbx, 32(%rdx)
+; FALLBACK6-NEXT:    movq %r13, 40(%rdx)
+; FALLBACK6-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK6-NEXT:    movq %r8, 24(%rdx)
+; FALLBACK6-NEXT:    movq %rdi, (%rdx)
 ; FALLBACK6-NEXT:    popq %rbx
 ; FALLBACK6-NEXT:    popq %r12
 ; FALLBACK6-NEXT:    popq %r13
 ; FALLBACK6-NEXT:    popq %r14
 ; FALLBACK6-NEXT:    popq %r15
-; FALLBACK6-NEXT:    popq %rbp
 ; FALLBACK6-NEXT:    retq
 ;
 ; FALLBACK7-LABEL: ashr_64bytes:
@@ -20979,13 +20948,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; FALLBACK10-LABEL: ashr_64bytes:
 ; FALLBACK10:       # %bb.0:
-; FALLBACK10-NEXT:    pushq %rbp
 ; FALLBACK10-NEXT:    pushq %r15
 ; FALLBACK10-NEXT:    pushq %r14
 ; FALLBACK10-NEXT:    pushq %r13
 ; FALLBACK10-NEXT:    pushq %r12
 ; FALLBACK10-NEXT:    pushq %rbx
-; FALLBACK10-NEXT:    pushq %rax
 ; FALLBACK10-NEXT:    vmovups (%rdi), %ymm0
 ; FALLBACK10-NEXT:    vmovups 32(%rdi), %xmm1
 ; FALLBACK10-NEXT:    movq 48(%rdi), %rcx
@@ -21004,62 +20971,60 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT:    leal (,%rax,8), %esi
-; FALLBACK10-NEXT:    andl $56, %esi
+; FALLBACK10-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK10-NEXT:    andl $56, %ecx
+; FALLBACK10-NEXT:    movl %ecx, %esi
 ; FALLBACK10-NEXT:    andl $56, %eax
-; FALLBACK10-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
-; FALLBACK10-NEXT:    movq -112(%rsp,%rax), %rcx
-; FALLBACK10-NEXT:    movq -104(%rsp,%rax), %rdi
-; FALLBACK10-NEXT:    shrxq %rsi, %rdi, %r12
-; FALLBACK10-NEXT:    movq -96(%rsp,%rax), %r13
-; FALLBACK10-NEXT:    shrxq %rsi, %rcx, %r9
-; FALLBACK10-NEXT:    movq -88(%rsp,%rax), %r10
-; FALLBACK10-NEXT:    shrxq %rsi, %r10, %r14
-; FALLBACK10-NEXT:    shrxq %rsi, %r13, %r15
-; FALLBACK10-NEXT:    movl %esi, %ebx
-; FALLBACK10-NEXT:    notb %bl
-; FALLBACK10-NEXT:    movq -120(%rsp,%rax), %rbp
-; FALLBACK10-NEXT:    leaq (%rbp,%rbp), %r8
-; FALLBACK10-NEXT:    shlxq %rbx, %r8, %r8
-; FALLBACK10-NEXT:    orq %r11, %r8
-; FALLBACK10-NEXT:    leaq (%r13,%r13), %r11
-; FALLBACK10-NEXT:    shlxq %rbx, %r11, %r11
-; FALLBACK10-NEXT:    orq %r12, %r11
+; FALLBACK10-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r8
+; FALLBACK10-NEXT:    notb %cl
+; FALLBACK10-NEXT:    movq -120(%rsp,%rax), %r10
+; FALLBACK10-NEXT:    movq -112(%rsp,%rax), %r9
+; FALLBACK10-NEXT:    leaq (%r10,%r10), %rdi
+; FALLBACK10-NEXT:    shlxq %rcx, %rdi, %rdi
+; FALLBACK10-NEXT:    orq %r8, %rdi
+; FALLBACK10-NEXT:    movq -104(%rsp,%rax), %r11
+; FALLBACK10-NEXT:    shrxq %rsi, %r11, %rbx
+; FALLBACK10-NEXT:    movq -96(%rsp,%rax), %r14
+; FALLBACK10-NEXT:    leaq (%r14,%r14), %r8
+; FALLBACK10-NEXT:    shlxq %rcx, %r8, %r8
+; FALLBACK10-NEXT:    orq %rbx, %r8
+; FALLBACK10-NEXT:    shrxq %rsi, %r9, %rbx
+; FALLBACK10-NEXT:    addq %r11, %r11
+; FALLBACK10-NEXT:    shlxq %rcx, %r11, %r11
+; FALLBACK10-NEXT:    orq %rbx, %r11
+; FALLBACK10-NEXT:    movq -88(%rsp,%rax), %rbx
+; FALLBACK10-NEXT:    shrxq %rsi, %rbx, %r15
 ; FALLBACK10-NEXT:    movq -80(%rsp,%rax), %r12
-; FALLBACK10-NEXT:    shrxq %rsi, %r12, %r13
-; FALLBACK10-NEXT:    shrxq %rsi, %rbp, %rbp
+; FALLBACK10-NEXT:    leaq (%r12,%r12), %r13
+; FALLBACK10-NEXT:    shlxq %rcx, %r13, %r13
+; FALLBACK10-NEXT:    orq %r15, %r13
+; FALLBACK10-NEXT:    shrxq %rsi, %r14, %r14
+; FALLBACK10-NEXT:    addq %rbx, %rbx
+; FALLBACK10-NEXT:    shlxq %rcx, %rbx, %rbx
+; FALLBACK10-NEXT:    orq %r14, %rbx
+; FALLBACK10-NEXT:    shrxq %rsi, %r12, %r14
 ; FALLBACK10-NEXT:    movq -72(%rsp,%rax), %rax
-; FALLBACK10-NEXT:    sarxq %rsi, %rax, %rsi
-; FALLBACK10-NEXT:    addq %rdi, %rdi
-; FALLBACK10-NEXT:    shlxq %rbx, %rdi, %rdi
-; FALLBACK10-NEXT:    orq %r9, %rdi
-; FALLBACK10-NEXT:    leaq (%r12,%r12), %r9
-; FALLBACK10-NEXT:    shlxq %rbx, %r9, %r9
-; FALLBACK10-NEXT:    orq %r14, %r9
-; FALLBACK10-NEXT:    addq %r10, %r10
-; FALLBACK10-NEXT:    shlxq %rbx, %r10, %r10
-; FALLBACK10-NEXT:    orq %r15, %r10
-; FALLBACK10-NEXT:    addq %rax, %rax
-; FALLBACK10-NEXT:    shlxq %rbx, %rax, %rax
-; FALLBACK10-NEXT:    orq %r13, %rax
-; FALLBACK10-NEXT:    addq %rcx, %rcx
-; FALLBACK10-NEXT:    shlxq %rbx, %rcx, %rcx
-; FALLBACK10-NEXT:    orq %rbp, %rcx
-; FALLBACK10-NEXT:    movq %rsi, 56(%rdx)
+; FALLBACK10-NEXT:    leaq (%rax,%rax), %r15
+; FALLBACK10-NEXT:    shlxq %rcx, %r15, %r15
+; FALLBACK10-NEXT:    orq %r14, %r15
+; FALLBACK10-NEXT:    shrxq %rsi, %r10, %r10
+; FALLBACK10-NEXT:    addq %r9, %r9
+; FALLBACK10-NEXT:    shlxq %rcx, %r9, %rcx
+; FALLBACK10-NEXT:    orq %r10, %rcx
+; FALLBACK10-NEXT:    sarxq %rsi, %rax, %rax
+; FALLBACK10-NEXT:    movq %rax, 56(%rdx)
 ; FALLBACK10-NEXT:    movq %rcx, 8(%rdx)
-; FALLBACK10-NEXT:    movq %rax, 48(%rdx)
-; FALLBACK10-NEXT:    movq %r10, 32(%rdx)
-; FALLBACK10-NEXT:    movq %r9, 40(%rdx)
-; FALLBACK10-NEXT:    movq %rdi, 16(%rdx)
-; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
-; FALLBACK10-NEXT:    movq %r8, (%rdx)
-; FALLBACK10-NEXT:    addq $8, %rsp
+; FALLBACK10-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK10-NEXT:    movq %rbx, 32(%rdx)
+; FALLBACK10-NEXT:    movq %r13, 40(%rdx)
+; FALLBACK10-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK10-NEXT:    movq %r8, 24(%rdx)
+; FALLBACK10-NEXT:    movq %rdi, (%rdx)
 ; FALLBACK10-NEXT:    popq %rbx
 ; FALLBACK10-NEXT:    popq %r12
 ; FALLBACK10-NEXT:    popq %r13
 ; FALLBACK10-NEXT:    popq %r14
 ; FALLBACK10-NEXT:    popq %r15
-; FALLBACK10-NEXT:    popq %rbp
 ; FALLBACK10-NEXT:    vzeroupper
 ; FALLBACK10-NEXT:    retq
 ;
@@ -21292,13 +21257,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; FALLBACK14-LABEL: ashr_64bytes:
 ; FALLBACK14:       # %bb.0:
-; FALLBACK14-NEXT:    pushq %rbp
 ; FALLBACK14-NEXT:    pushq %r15
 ; FALLBACK14-NEXT:    pushq %r14
 ; FALLBACK14-NEXT:    pushq %r13
 ; FALLBACK14-NEXT:    pushq %r12
 ; FALLBACK14-NEXT:    pushq %rbx
-; FALLBACK14-NEXT:    pushq %rax
 ; FALLBACK14-NEXT:    vmovups (%rdi), %ymm0
 ; FALLBACK14-NEXT:    vmovups 32(%rdi), %xmm1
 ; FALLBACK14-NEXT:    movq 48(%rdi), %rcx
@@ -21317,62 +21280,60 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT:    leal (,%rax,8), %esi
-; FALLBACK14-NEXT:    andl $56, %esi
+; FALLBACK14-NEXT:    leal (,%rax,8), %ecx
+; FALLBACK14-NEXT:    andl $56, %ecx
+; FALLBACK14-NEXT:    movl %ecx, %esi
 ; FALLBACK14-NEXT:    andl $56, %eax
-; FALLBACK14-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
-; FALLBACK14-NEXT:    movq -112(%rsp,%rax), %rcx
-; FALLBACK14-NEXT:    movq -104(%rsp,%rax), %rdi
-; FALLBACK14-NEXT:    shrxq %rsi, %rdi, %r12
-; FALLBACK14-NEXT:    movq -96(%rsp,%rax), %r13
-; FALLBACK14-NEXT:    shrxq %rsi, %rcx, %r9
-; FALLBACK14-NEXT:    movq -88(%rsp,%rax), %r10
-; FALLBACK14-NEXT:    shrxq %rsi, %r10, %r14
-; FALLBACK14-NEXT:    shrxq %rsi, %r13, %r15
-; FALLBACK14-NEXT:    movl %esi, %ebx
-; FALLBACK14-NEXT:    notb %bl
-; FALLBACK14-NEXT:    movq -120(%rsp,%rax), %rbp
-; FALLBACK14-NEXT:    leaq (%rbp,%rbp), %r8
-; FALLBACK14-NEXT:    shlxq %rbx, %r8, %r8
-; FALLBACK14-NEXT:    orq %r11, %r8
-; FALLBACK14-NEXT:    leaq (%r13,%r13), %r11
-; FALLBACK14-NEXT:    shlxq %rbx, %r11, %r11
-; FALLBACK14-NEXT:    orq %r12, %r11
+; FALLBACK14-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r8
+; FALLBACK14-NEXT:    notb %cl
+; FALLBACK14-NEXT:    movq -120(%rsp,%rax), %r10
+; FALLBACK14-NEXT:    movq -112(%rsp,%rax), %r9
+; FALLBACK14-NEXT:    leaq (%r10,%r10), %rdi
+; FALLBACK14-NEXT:    shlxq %rcx, %rdi, %rdi
+; FALLBACK14-NEXT:    orq %r8, %rdi
+; FALLBACK14-NEXT:    movq -104(%rsp,%rax), %r11
+; FALLBACK14-NEXT:    shrxq %rsi, %r11, %rbx
+; FALLBACK14-NEXT:    movq -96(%rsp,%rax), %r14
+; FALLBACK14-NEXT:    leaq (%r14,%r14), %r8
+; FALLBACK14-NEXT:    shlxq %rcx, %r8, %r8
+; FALLBACK14-NEXT:    orq %rbx, %r8
+; FALLBACK14-NEXT:    shrxq %rsi, %r9, %rbx
+; FALLBACK14-NEXT:    addq %r11, %r11
+; FALLBACK14-NEXT:    shlxq %rcx, %r11, %r11
+; FALLBACK14-NEXT:    orq %rbx, %r11
+; FALLBACK14-NEXT:    movq -88(%rsp,%rax), %rbx
+; FALLBACK14-NEXT:    shrxq %rsi, %rbx, %r15
 ; FALLBACK14-NEXT:    movq -80(%rsp,%rax), %r12
-; FALLBACK14-NEXT:    shrxq %rsi, %r12, %r13
-; FALLBACK14-NEXT:    shrxq %rsi, %rbp, %rbp
+; FALLBACK14-NEXT:    leaq (%r12,%r12), %r13
+; FALLBACK14-NEXT:    shlxq %rcx, %r13, %r13
+; FALLBACK14-NEXT:    orq %r15, %r13
+; FALLBACK14-NEXT:    shrxq %rsi, %r14, %r14
+; FALLBACK14-NEXT:    addq %rbx, %rbx
+; FALLBACK14-NEXT:    shlxq %rcx, %rbx, %rbx
+; FALLBACK14-NEXT:    orq %r14, %rbx
+; FALLBACK14-NEXT:    shrxq %rsi, %r12, %r14
 ; FALLBACK14-NEXT:    movq -72(%rsp,%rax), %rax
-; FALLBACK14-NEXT:    sarxq %rsi, %rax, %rsi
-; FALLBACK14-NEXT:    addq %rdi, %rdi
-; FALLBACK14-NEXT:    shlxq %rbx, %rdi, %rdi
-; FALLBACK14-NEXT:    orq %r9, %rdi
-; FALLBACK14-NEXT:    leaq (%r12,%r12), %r9
-; FALLBACK14-NEXT:    shlxq %rbx, %r9, %r9
-; FALLBACK14-NEXT:    orq %r14, %r9
-; FALLBACK14-NEXT:    addq %r10, %r10
-; FALLBACK14-NEXT:    shlxq %rbx, %r10, %r10
-; FALLBACK14-NEXT:    orq %r15, %r10
-; FALLBACK14-NEXT:    addq %rax, %rax
-; FALLBACK14-NEXT:    shlxq %rbx, %rax, %rax
-; FALLBACK14-NEXT:    orq %r13, %rax
-; FALLBACK14-NEXT:    addq %rcx, %rcx
-; FALLBACK14-NEXT:    shlxq %rbx, %rcx, %rcx
-; FALLBACK14-NEXT:    orq %rbp, %rcx
-; FALLBACK14-NEXT:    movq %rsi, 56(%rdx)
+; FALLBACK14-NEXT:    leaq (%rax,%rax), %r15
+; FALLBACK14-NEXT:    shlxq %rcx, %r15, %r15
+; FALLBACK14-NEXT:    orq %r14, %r15
+; FALLBACK14-NEXT:    shrxq %rsi, %r10, %r10
+; FALLBACK14-NEXT:    addq %r9, %r9
+; FALLBACK14-NEXT:    shlxq %rcx, %r9, %rcx
+; FALLBACK14-NEXT:    orq %r10, %rcx
+; FALLBACK14-NEXT:    sarxq %rsi, %rax, %rax
+; FALLBACK14-NEXT:    movq %rax, 56(%rdx)
 ; FALLBACK14-NEXT:    movq %rcx, 8(%rdx)
-; FALLBACK14-NEXT:    movq %rax, 48(%rdx)
-; FALLBACK14-NEXT:    movq %r10, 32(%rdx)
-; FALLBACK14-NEXT:    movq %r9, 40(%rdx)
-; FALLBACK14-NEXT:    movq %rdi, 16(%rdx)
-; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
-; FALLBACK14-NEXT:    movq %r8, (%rdx)
-; FALLBACK14-NEXT:    addq $8, %rsp
+; FALLBACK14-NEXT:    movq %r15, 48(%rdx)
+; FALLBACK14-NEXT:    movq %rbx, 32(%rdx)
+; FALLBACK14-NEXT:    movq %r13, 40(%rdx)
+; FALLBACK14-NEXT:    movq %r11, 16(%rdx)
+; FALLBACK14-NEXT:    movq %r8, 24(%rdx)
+; FALLBACK14-NEXT:    movq %rdi, (%rdx)
 ; FALLBACK14-NEXT:    popq %rbx
 ; FALLBACK14-NEXT:    popq %r12
 ; FALLBACK14-NEXT:    popq %r13
 ; FALLBACK14-NEXT:    popq %r14
 ; FALLBACK14-NEXT:    popq %r15
-; FALLBACK14-NEXT:    popq %rbp
 ; FALLBACK14-NEXT:    vzeroupper
 ; FALLBACK14-NEXT:    retq
 ;
@@ -21960,111 +21921,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl %eax, %ecx
 ; FALLBACK18-NEXT:    leal (,%eax,8), %edx
 ; FALLBACK18-NEXT:    andl $24, %edx
+; FALLBACK18-NEXT:    movl %edx, %ebx
 ; FALLBACK18-NEXT:    andl $60, %ecx
 ; FALLBACK18-NEXT:    movl 68(%esp,%ecx), %esi
 ; FALLBACK18-NEXT:    movl 72(%esp,%ecx), %edi
 ; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl %edx, %ebx
-; FALLBACK18-NEXT:    notb %bl
+; FALLBACK18-NEXT:    notb %dl
 ; FALLBACK18-NEXT:    leal (%edi,%edi), %ebp
-; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %ebp, %eax
 ; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK18-NEXT:    shrxl %ebx, 64(%esp,%ecx), %edi
 ; FALLBACK18-NEXT:    addl %esi, %esi
-; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %eax
 ; FALLBACK18-NEXT:    orl %edi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 80(%esp,%ecx), %esi
 ; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK18-NEXT:    movl 76(%esp,%ecx), %edi
-; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK18-NEXT:    addl %edi, %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK18-NEXT:    orl %eax, %edi
 ; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 88(%esp,%ecx), %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK18-NEXT:    movl 84(%esp,%ecx), %edi
-; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %esi
 ; FALLBACK18-NEXT:    addl %edi, %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK18-NEXT:    orl %esi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 96(%esp,%ecx), %esi
 ; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK18-NEXT:    movl 92(%esp,%ecx), %edi
-; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK18-NEXT:    addl %edi, %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK18-NEXT:    orl %eax, %edi
 ; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 104(%esp,%ecx), %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK18-NEXT:    movl 100(%esp,%ecx), %edi
-; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK18-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %esi
 ; FALLBACK18-NEXT:    addl %edi, %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK18-NEXT:    orl %esi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl %ecx, %ebp
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 112(%esp,%ecx), %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    leal (%eax,%eax), %esi
-; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %eax
 ; FALLBACK18-NEXT:    movl 108(%esp,%ecx), %esi
-; FALLBACK18-NEXT:    movl %ecx, %edi
-; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %esi, %edi
+; FALLBACK18-NEXT:    orl %edi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; FALLBACK18-NEXT:    addl %esi, %esi
-; FALLBACK18-NEXT:    shlxl %ebx, %esi, %esi
-; FALLBACK18-NEXT:    orl %ecx, %esi
-; FALLBACK18-NEXT:    movl 120(%esp,%edi), %ebp
-; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
-; FALLBACK18-NEXT:    shlxl %ebx, %ecx, %ecx
-; FALLBACK18-NEXT:    movl 116(%esp,%edi), %eax
-; FALLBACK18-NEXT:    shrxl %edx, %eax, %edi
-; FALLBACK18-NEXT:    orl %edi, %ecx
-; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %eax
+; FALLBACK18-NEXT:    orl %ecx, %eax
+; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 120(%esp,%ebp), %edi
+; FALLBACK18-NEXT:    leal (%edi,%edi), %ecx
+; FALLBACK18-NEXT:    shlxl %edx, %ecx, %esi
+; FALLBACK18-NEXT:    movl 116(%esp,%ebp), %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %eax, %ebp
+; FALLBACK18-NEXT:    orl %ebp, %esi
+; FALLBACK18-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK18-NEXT:    addl %eax, %eax
-; FALLBACK18-NEXT:    shlxl %ebx, %eax, %edi
-; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK18-NEXT:    shrxl %edx, %ebp, %eax
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK18-NEXT:    movl 124(%esp,%ebp), %ebp
-; FALLBACK18-NEXT:    sarxl %edx, %ebp, %edx
-; FALLBACK18-NEXT:    addl %ebp, %ebp
-; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %ebx
-; FALLBACK18-NEXT:    orl %eax, %ebx
+; FALLBACK18-NEXT:    shlxl %edx, %eax, %ecx
+; FALLBACK18-NEXT:    orl %ebp, %ecx
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl 124(%esp,%eax), %eax
+; FALLBACK18-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK18-NEXT:    shlxl %edx, %ebp, %edx
+; FALLBACK18-NEXT:    shrxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    orl %edi, %edx
+; FALLBACK18-NEXT:    sarxl %ebx, %eax, %edi
 ; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT:    movl %edx, 60(%eax)
-; FALLBACK18-NEXT:    movl %ebx, 56(%eax)
-; FALLBACK18-NEXT:    movl %edi, 48(%eax)
-; FALLBACK18-NEXT:    movl %ecx, 52(%eax)
-; FALLBACK18-NEXT:    movl %esi, 40(%eax)
+; FALLBACK18-NEXT:    movl %edi, 60(%eax)
+; FALLBACK18-NEXT:    movl %edx, 56(%eax)
+; FALLBACK18-NEXT:    movl %ecx, 48(%eax)
+; FALLBACK18-NEXT:    movl %esi, 52(%eax)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, 40(%eax)
 ; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK18-NEXT:    movl %ecx, 44(%eax)
 ; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -22664,111 +22626,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movl %eax, %ecx
 ; FALLBACK22-NEXT:    leal (,%eax,8), %edx
 ; FALLBACK22-NEXT:    andl $24, %edx
+; FALLBACK22-NEXT:    movl %edx, %ebx
 ; FALLBACK22-NEXT:    andl $60, %ecx
 ; FALLBACK22-NEXT:    movl 68(%esp,%ecx), %esi
 ; FALLBACK22-NEXT:    movl 72(%esp,%ecx), %edi
 ; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl %edx, %ebx
-; FALLBACK22-NEXT:    notb %bl
+; FALLBACK22-NEXT:    notb %dl
 ; FALLBACK22-NEXT:    leal (%edi,%edi), %ebp
-; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %ebp, %eax
 ; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shrxl %ebx, 64(%esp,%ecx), %edi
 ; FALLBACK22-NEXT:    addl %esi, %esi
-; FALLBACK22-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %eax
 ; FALLBACK22-NEXT:    orl %edi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 80(%esp,%ecx), %esi
 ; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK22-NEXT:    movl 76(%esp,%ecx), %edi
-; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK22-NEXT:    orl %eax, %edi
 ; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 88(%esp,%ecx), %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK22-NEXT:    movl 84(%esp,%ecx), %edi
-; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
 ; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK22-NEXT:    orl %esi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 96(%esp,%ecx), %esi
 ; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK22-NEXT:    movl 92(%esp,%ecx), %edi
-; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK22-NEXT:    orl %eax, %edi
 ; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 104(%esp,%ecx), %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK22-NEXT:    movl 100(%esp,%ecx), %edi
-; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
 ; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK22-NEXT:    orl %esi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl %ecx, %ebp
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 112(%esp,%ecx), %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    leal (%eax,%eax), %esi
-; FALLBACK22-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %eax
 ; FALLBACK22-NEXT:    movl 108(%esp,%ecx), %esi
-; FALLBACK22-NEXT:    movl %ecx, %edi
-; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %esi, %edi
+; FALLBACK22-NEXT:    orl %edi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    addl %esi, %esi
-; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
-; FALLBACK22-NEXT:    orl %ecx, %esi
-; FALLBACK22-NEXT:    movl 120(%esp,%edi), %ebp
-; FALLBACK22-NEXT:    leal (%ebp,%ebp), %ecx
-; FALLBACK22-NEXT:    shlxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT:    movl 116(%esp,%edi), %eax
-; FALLBACK22-NEXT:    shrxl %edx, %eax, %edi
-; FALLBACK22-NEXT:    orl %edi, %ecx
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %eax
+; FALLBACK22-NEXT:    orl %ecx, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 120(%esp,%ebp), %edi
+; FALLBACK22-NEXT:    leal (%edi,%edi), %ecx
+; FALLBACK22-NEXT:    shlxl %edx, %ecx, %esi
+; FALLBACK22-NEXT:    movl 116(%esp,%ebp), %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %eax, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %esi
+; FALLBACK22-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    addl %eax, %eax
-; FALLBACK22-NEXT:    shlxl %ebx, %eax, %edi
-; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK22-NEXT:    shrxl %edx, %ebp, %eax
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK22-NEXT:    movl 124(%esp,%ebp), %ebp
-; FALLBACK22-NEXT:    sarxl %edx, %ebp, %edx
-; FALLBACK22-NEXT:    addl %ebp, %ebp
-; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %ebx
-; FALLBACK22-NEXT:    orl %eax, %ebx
+; FALLBACK22-NEXT:    shlxl %edx, %eax, %ecx
+; FALLBACK22-NEXT:    orl %ebp, %ecx
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl 124(%esp,%eax), %eax
+; FALLBACK22-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK22-NEXT:    shlxl %edx, %ebp, %edx
+; FALLBACK22-NEXT:    shrxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    orl %edi, %edx
+; FALLBACK22-NEXT:    sarxl %ebx, %eax, %edi
 ; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT:    movl %edx, 60(%eax)
-; FALLBACK22-NEXT:    movl %ebx, 56(%eax)
-; FALLBACK22-NEXT:    movl %edi, 48(%eax)
-; FALLBACK22-NEXT:    movl %ecx, 52(%eax)
-; FALLBACK22-NEXT:    movl %esi, 40(%eax)
+; FALLBACK22-NEXT:    movl %edi, 60(%eax)
+; FALLBACK22-NEXT:    movl %edx, 56(%eax)
+; FALLBACK22-NEXT:    movl %ecx, 48(%eax)
+; FALLBACK22-NEXT:    movl %esi, 52(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 40(%eax)
 ; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK22-NEXT:    movl %ecx, 44(%eax)
 ; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -23326,111 +23289,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    movl %eax, %ecx
 ; FALLBACK26-NEXT:    leal (,%eax,8), %edx
 ; FALLBACK26-NEXT:    andl $24, %edx
+; FALLBACK26-NEXT:    movl %edx, %ebx
 ; FALLBACK26-NEXT:    andl $60, %ecx
 ; FALLBACK26-NEXT:    movl 68(%esp,%ecx), %esi
 ; FALLBACK26-NEXT:    movl 72(%esp,%ecx), %edi
 ; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, %esi, %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl %edx, %ebx
-; FALLBACK26-NEXT:    notb %bl
+; FALLBACK26-NEXT:    notb %dl
 ; FALLBACK26-NEXT:    leal (%edi,%edi), %ebp
-; FALLBACK26-NEXT:    shlxl %ebx, %ebp, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %ebp, %eax
 ; FALLBACK26-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK26-NEXT:    shrxl %ebx, 64(%esp,%ecx), %edi
 ; FALLBACK26-NEXT:    addl %esi, %esi
-; FALLBACK26-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %eax
 ; FALLBACK26-NEXT:    orl %edi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 80(%esp,%ecx), %esi
 ; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK26-NEXT:    movl 76(%esp,%ecx), %edi
-; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    addl %edi, %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK26-NEXT:    orl %eax, %edi
 ; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 88(%esp,%ecx), %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK26-NEXT:    movl 84(%esp,%ecx), %edi
-; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
 ; FALLBACK26-NEXT:    addl %edi, %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK26-NEXT:    orl %esi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 96(%esp,%ecx), %esi
 ; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK26-NEXT:    movl 92(%esp,%ecx), %edi
-; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    addl %edi, %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK26-NEXT:    orl %eax, %edi
 ; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 104(%esp,%ecx), %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %eax
 ; FALLBACK26-NEXT:    movl 100(%esp,%ecx), %edi
-; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK26-NEXT:    shrxl %ebx, %edi, %ebp
 ; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:
shrxl %edx, %esi, %esi +; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi -; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK26-NEXT: shlxl %edx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl %ecx, %ebp +; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK26-NEXT: shlxl %edx, %esi, %eax ; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK26-NEXT: movl %ecx, %edi -; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK26-NEXT: orl %ebp, %eax +; FALLBACK26-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %esi, %esi -; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK26-NEXT: orl %ecx, %esi -; FALLBACK26-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK26-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK26-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK26-NEXT: shrxl %edx, %eax, %edi -; FALLBACK26-NEXT: orl %edi, %ecx -; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: shlxl %edx, %esi, %eax +; FALLBACK26-NEXT: orl %ecx, %eax +; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK26-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK26-NEXT: leal (%edi,%edi), %ecx +; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK26-NEXT: movl 116(%esp,%ebp), %eax +; FALLBACK26-NEXT: shrxl %ebx, %eax, %ebp +; FALLBACK26-NEXT: orl %ebp, %esi +; FALLBACK26-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %eax, %eax -; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK26-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK26-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK26-NEXT: addl %ebp, %ebp -; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK26-NEXT: orl %eax, %ebx +; FALLBACK26-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK26-NEXT: orl %ebp, %ecx +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK26-NEXT: movl 124(%esp,%eax), %eax +; FALLBACK26-NEXT: leal (%eax,%eax), %ebp +; FALLBACK26-NEXT: shlxl %edx, %ebp, %edx +; FALLBACK26-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK26-NEXT: orl %edi, %edx +; FALLBACK26-NEXT: sarxl %ebx, %eax, %edi ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK26-NEXT: movl %edx, 60(%eax) -; FALLBACK26-NEXT: movl %ebx, 56(%eax) -; FALLBACK26-NEXT: movl %edi, 48(%eax) -; FALLBACK26-NEXT: movl %ecx, 52(%eax) -; FALLBACK26-NEXT: movl %esi, 40(%eax) +; FALLBACK26-NEXT: movl %edi, 60(%eax) +; FALLBACK26-NEXT: movl %edx, 56(%eax) +; FALLBACK26-NEXT: movl %ecx, 48(%eax) +; FALLBACK26-NEXT: movl %esi, 52(%eax) +; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 
4-byte Reload +; FALLBACK26-NEXT: movl %ecx, 40(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 44(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -23988,111 +23952,112 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl %eax, %ecx ; FALLBACK30-NEXT: leal (,%eax,8), %edx ; FALLBACK30-NEXT: andl $24, %edx +; FALLBACK30-NEXT: movl %edx, %ebx ; FALLBACK30-NEXT: andl $60, %ecx ; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK30-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %eax +; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: movl %edx, %ebx -; FALLBACK30-NEXT: notb %bl +; FALLBACK30-NEXT: notb %dl ; FALLBACK30-NEXT: leal (%edi,%edi), %ebp -; FALLBACK30-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK30-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK30-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax ; FALLBACK30-NEXT: orl %edi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl 
%edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi +; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %esi +; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi -; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax +; FALLBACK30-NEXT: shlxl %edx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl %ecx, %ebp +; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax ; FALLBACK30-NEXT: movl 108(%esp,%ecx), %esi -; FALLBACK30-NEXT: movl %ecx, %edi -; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp -; FALLBACK30-NEXT: orl %ebp, %eax +; FALLBACK30-NEXT: shrxl %ebx, %esi, %edi +; FALLBACK30-NEXT: orl %edi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %esi, %esi -; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK30-NEXT: orl %ecx, %esi -; FALLBACK30-NEXT: movl 120(%esp,%edi), %ebp -; FALLBACK30-NEXT: leal (%ebp,%ebp), %ecx -; FALLBACK30-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK30-NEXT: movl 116(%esp,%edi), %eax -; FALLBACK30-NEXT: shrxl %edx, %eax, %edi -; FALLBACK30-NEXT: orl %edi, %ecx -; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: shlxl %edx, %esi, %eax +; FALLBACK30-NEXT: orl %ecx, %eax +; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK30-NEXT: movl 120(%esp,%ebp), %edi +; FALLBACK30-NEXT: leal (%edi,%edi), %ecx +; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi +; FALLBACK30-NEXT: movl 116(%esp,%ebp), %eax +; FALLBACK30-NEXT: shrxl %ebx, %eax, %ebp +; FALLBACK30-NEXT: orl %ebp, %esi +; FALLBACK30-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %eax, %eax -; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi -; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK30-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK30-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK30-NEXT: sarxl %edx, %ebp, %edx -; FALLBACK30-NEXT: addl %ebp, %ebp -; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK30-NEXT: orl %eax, %ebx +; 
FALLBACK30-NEXT: shlxl %edx, %eax, %ecx +; FALLBACK30-NEXT: orl %ebp, %ecx +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK30-NEXT: movl 124(%esp,%eax), %eax +; FALLBACK30-NEXT: leal (%eax,%eax), %ebp +; FALLBACK30-NEXT: shlxl %edx, %ebp, %edx +; FALLBACK30-NEXT: shrxl %ebx, %edi, %edi +; FALLBACK30-NEXT: orl %edi, %edx +; FALLBACK30-NEXT: sarxl %ebx, %eax, %edi ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK30-NEXT: movl %edx, 60(%eax) -; FALLBACK30-NEXT: movl %ebx, 56(%eax) -; FALLBACK30-NEXT: movl %edi, 48(%eax) -; FALLBACK30-NEXT: movl %ecx, 52(%eax) -; FALLBACK30-NEXT: movl %esi, 40(%eax) +; FALLBACK30-NEXT: movl %edi, 60(%eax) +; FALLBACK30-NEXT: movl %edx, 56(%eax) +; FALLBACK30-NEXT: movl %ecx, 48(%eax) +; FALLBACK30-NEXT: movl %esi, 52(%eax) +; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK30-NEXT: movl %ecx, 40(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 44(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll index 338e104fbe8f0..221a51ed44696 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll @@ -712,33 +712,33 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: 
movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -994,42 +994,42 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %al, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, 28(%esp,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, 28(%esp,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 12(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1297,33 +1297,33 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%edi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ecx, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1487,31 +1487,31 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -72(%rsp,%rsi,8), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rsi,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rsi, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes: @@ -1761,88 +1761,90 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %eax, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %esi, %esi +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 20(%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -2040,32 +2042,32 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %cl, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %sil, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rdi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -16(%rsp,%rdi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rsi, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rdi, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; @@ -2319,97 +2321,101 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %dl, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 92(%esp,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, 92(%esp,%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -2610,31 +2616,31 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rsi, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %sil +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -72(%rsp,%rsi,8), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; 
X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rsi,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rsi, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes: @@ -2927,60 +2933,59 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 32(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ecx, %esi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 24(%esi) +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 20(%esi) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -3263,13 +3268,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 @@ -3292,65 +3295,63 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, 
%r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r13, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r12, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes: @@ -3868,20 +3869,20 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: 
movl (%eax), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -3906,116 +3907,117 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ebx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, 
%ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ecx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ecx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 56(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 48(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -4388,10 +4390,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_64bytes: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax @@ -4419,63 +4419,61 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r14, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r12 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r13d -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r13b -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rbp -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r14, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rcx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rdi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r9, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r11, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rbx, %r14 +; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r15, %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r15, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, -8(%rsp,%rsi), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rsi, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rsi, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r12, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rbx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 56(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_64bytes: @@ -4972,33 +4970,33 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ebp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 @@ -5011,7 +5009,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -5032,149 +5030,152 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebp -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl 
%ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, (%esp), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: 
shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 188(%esp,%ecx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax 
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 56(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 60(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 52(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 40(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 44(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 32(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 36(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -5534,13 +5535,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 @@ -5567,65 +5566,63 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r10d +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %rdi, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx -; 
X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r12 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r13, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r12, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes: @@ -6221,33 +6218,31 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ebx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi @@ -6256,87 +6251,84 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 
%ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %edx, %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 56(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 48(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll index c3054a365c466..6b5c6049f025b 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll @@ -1635,22 +1635,22 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rcx,8), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 
-56(%rsp,%rax,8), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx,8), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half: @@ -1807,40 +1807,43 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%esi,4), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal 
(%ecx,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi,4), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi @@ -1906,13 +1909,13 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: leal (,%rsi,8), %eax ; X64-BMI2-NEXT: andl $56, %eax -; X64-BMI2-NEXT: andl $56, %esi -; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx -; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; X64-BMI2-NEXT: movl %eax, %ecx ; X64-BMI2-NEXT: notl %eax -; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi -; X64-BMI2-NEXT: addl %esi, %esi -; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax +; X64-BMI2-NEXT: andl $56, %esi +; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi +; X64-BMI2-NEXT: addl %edi, %edi +; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax +; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx ; X64-BMI2-NEXT: orl %eax, %ecx ; X64-BMI2-NEXT: movb %cl, (%rdx) ; X64-BMI2-NEXT: popq %rax @@ -2070,13 +2073,13 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: leal (,%rsi,8), %eax ; X64-BMI2-NEXT: andl $56, %eax -; X64-BMI2-NEXT: andl $56, %esi -; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx -; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; X64-BMI2-NEXT: movl %eax, %ecx ; X64-BMI2-NEXT: notl %eax -; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi -; X64-BMI2-NEXT: addl %esi, %esi -; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax +; X64-BMI2-NEXT: andl $56, %esi +; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi +; X64-BMI2-NEXT: addl %edi, %edi +; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax +; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx ; X64-BMI2-NEXT: orl %eax, %ecx ; X64-BMI2-NEXT: movw %cx, (%rdx) ; X64-BMI2-NEXT: popq %rax @@ -2233,13 +2236,13 @@ define void 
@load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-BMI2-NEXT: leal (,%rsi,8), %eax ; X64-BMI2-NEXT: andl $56, %eax -; X64-BMI2-NEXT: andl $56, %esi -; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx -; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax +; X64-BMI2-NEXT: movl %eax, %ecx ; X64-BMI2-NEXT: notl %eax -; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi -; X64-BMI2-NEXT: addl %esi, %esi -; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax +; X64-BMI2-NEXT: andl $56, %esi +; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi +; X64-BMI2-NEXT: addl %edi, %edi +; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax +; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx ; X64-BMI2-NEXT: orl %eax, %ecx ; X64-BMI2-NEXT: movl %ecx, (%rdx) ; X64-BMI2-NEXT: popq %rax @@ -2521,10 +2524,11 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X86-HAVE-BMI2-NO-SHLD: # %bb.0: +; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $128, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $140, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -2541,25 +2545,26 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $dl killed $dl killed $edx def $edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, (%esp,%ecx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $128, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $140, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: retl %init = load <32 x i8>, ptr %src, align 1 %intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> @@ -2667,21 +2672,21 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-NO-SHLD-NEXT: leal 
(,%rsi,8), %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, -128(%rsp,%rsi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r10, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r9, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq @@ -2860,33 +2865,33 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 16(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 8(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp @@ -3026,9 +3031,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; ; X64-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X64-HAVE-BMI2-NO-SHLD: # %bb.0: -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1 ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 @@ -3043,38 +3046,36 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, -128(%rsp,%rsi), %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r9, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r9, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r10, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rbx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r10, %rdi ; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half: @@ -3304,7 +3305,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $156, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $172, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 @@ -3320,59 +3321,60 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 32(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%eax), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%eax), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%eax), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 24(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) @@ -3380,7 +3382,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $172, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll index 7735500bd3a88..bed8e5806380c 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll @@ -1879,22 +1879,22 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al -; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl +; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rcx,8), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx,8), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx,8), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rcx, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca: @@ -2055,40 +2055,43 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%esi,4), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), 
%esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi,4), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi,4), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi,4), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll index 4d261a9810896..9fbbba2ed3b47 100644 --- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll +++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll @@ -820,7 +820,7 @@ define void @infiniteloop() { ; ENABLE-NEXT: movq %rsp, %rax ; ENABLE-NEXT: addq $-16, %rax ; ENABLE-NEXT: movq %rax, %rsp -; ENABLE-NEXT: xorl %ecx, %ecx +; ENABLE-NEXT: xorl %ecx, %ecx ; ENABLE-NEXT: .p2align 4 ; ENABLE-NEXT: LBB10_2: ## %for.body ; ENABLE-NEXT: ## =>This Inner Loop 
Header: Depth=1 @@ -851,8 +851,8 @@ define void @infiniteloop() { ; DISABLE-NEXT: ## %bb.1: ## %if.then ; DISABLE-NEXT: movq %rsp, %rax ; DISABLE-NEXT: addq $-16, %rax -; DISABLE-NEXT: %rax, %rsp -; DISABLE-NEXT: xorl %ecx, %ecx +; DISABLE-NEXT: movq %rax, %rsp +; DISABLE-NEXT: xorl %ecx, %ecx ; DISABLE-NEXT: .p2align 4 ; DISABLE-NEXT: LBB10_2: ## %for.body ; DISABLE-NEXT: ## =>This Inner Loop Header: Depth=1 @@ -1185,10 +1185,10 @@ define i32 @useLEAForPrologue(i32 %d, i32 %a, i8 %c) #3 { ; ENABLE-NEXT: .p2align 4 ; ENABLE-NEXT: LBB14_2: ## %for.body ; ENABLE-NEXT: ## =>This Inner Loop Header: Depth=1 -; ENABLE-NEXT: cmpl %esi, %edi -; ENABLE-NEXT: setl %al +; ENABLE-NEXT: movl %esi, %eax ; ENABLE-NEXT: xorl %esi, %esi -; ENABLE-NEXT: movb %al, %sil +; ENABLE-NEXT: cmpl %eax, %edi +; ENABLE-NEXT: setl %sil ; ENABLE-NEXT: incb %dl ; ENABLE-NEXT: cmpb $45, %dl ; ENABLE-NEXT: jl LBB14_2 @@ -1220,10 +1220,10 @@ define i32 @useLEAForPrologue(i32 %d, i32 %a, i8 %c) #3 { ; DISABLE-NEXT: .p2align 4 ; DISABLE-NEXT: LBB14_2: ## %for.body ; DISABLE-NEXT: ## =>This Inner Loop Header: Depth=1 -; DISABLE-NEXT: cmpl %esi, %edi -; DISABLE-NEXT: setl %al +; DISABLE-NEXT: movl %esi, %eax ; DISABLE-NEXT: xorl %esi, %esi -; DISABLE-NEXT: movb %al, %sil +; DISABLE-NEXT: cmpl %eax, %edi +; DISABLE-NEXT: setl %sil ; DISABLE-NEXT: incb %dl ; DISABLE-NEXT: cmpb $45, %dl ; DISABLE-NEXT: jl LBB14_2 diff --git a/llvm/test/CodeGen/X86/xor.ll b/llvm/test/CodeGen/X86/xor.ll index 2bef66825d8c0..59fbf7183abc6 100644 --- a/llvm/test/CodeGen/X86/xor.ll +++ b/llvm/test/CodeGen/X86/xor.ll @@ -62,12 +62,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB3_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: notl %edx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: notl %ecx +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: jne .LBB3_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: retl @@ -78,12 +78,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB3_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LIN-NEXT: movl %esi, %ecx ; X64-LIN-NEXT: xorl %esi, %eax -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: notl %ecx -; X64-LIN-NEXT: andl %esi, %ecx -; X64-LIN-NEXT: addl %ecx, %ecx -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: notl %esi +; X64-LIN-NEXT: andl %ecx, %esi +; X64-LIN-NEXT: addl %esi, %esi ; X64-LIN-NEXT: jne .LBB3_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: retq @@ -94,12 +94,12 @@ define i32 @test4(i32 %a, i32 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB3_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-WIN-NEXT: movl %edx, %ecx ; X64-WIN-NEXT: xorl %edx, %eax -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: notl %ecx -; X64-WIN-NEXT: andl %edx, %ecx -; X64-WIN-NEXT: addl %ecx, %ecx -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: notl %edx +; X64-WIN-NEXT: andl %ecx, %edx +; X64-WIN-NEXT: addl %edx, %edx ; X64-WIN-NEXT: jne .LBB3_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: retq @@ -126,13 +126,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB4_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movl %eax, 
%edx -; X86-NEXT: notl %edx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: testw %dx, %dx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: xorl %edx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: notl %ecx +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: testw %cx, %cx ; X86-NEXT: jne .LBB4_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: # kill: def $ax killed $ax killed $eax @@ -144,13 +144,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB4_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-LIN-NEXT: xorl %esi, %eax -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: notl %ecx -; X64-LIN-NEXT: andl %esi, %ecx -; X64-LIN-NEXT: addl %ecx, %ecx -; X64-LIN-NEXT: testw %cx, %cx -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %esi, %ecx +; X64-LIN-NEXT: xorl %ecx, %eax +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: notl %esi +; X64-LIN-NEXT: andl %ecx, %esi +; X64-LIN-NEXT: addl %esi, %esi +; X64-LIN-NEXT: testw %si, %si ; X64-LIN-NEXT: jne .LBB4_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: # kill: def $ax killed $ax killed $eax @@ -163,13 +163,13 @@ define i16 @test5(i16 %a, i16 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB4_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-WIN-NEXT: xorl %edx, %eax -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: notl %ecx -; X64-WIN-NEXT: andl %edx, %ecx -; X64-WIN-NEXT: addl %ecx, %ecx -; X64-WIN-NEXT: testw %cx, %cx -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %edx, %ecx +; X64-WIN-NEXT: xorl %ecx, %eax +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: notl %edx +; X64-WIN-NEXT: andl %ecx, %edx +; X64-WIN-NEXT: addl %edx, %edx +; X64-WIN-NEXT: testw %dx, %dx ; X64-WIN-NEXT: jne .LBB4_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: # kill: def $ax killed $ax killed $eax @@ -197,12 +197,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB5_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorb %cl, %al -; X86-NEXT: movl %eax, %edx -; X86-NEXT: notb %dl -; X86-NEXT: andb %cl, %dl -; X86-NEXT: addb %dl, %dl -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: notb %cl +; X86-NEXT: andb %dl, %cl +; X86-NEXT: addb %cl, %cl ; X86-NEXT: jne .LBB5_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: retl @@ -213,12 +213,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB5_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LIN-NEXT: movl %esi, %ecx ; X64-LIN-NEXT: xorb %sil, %al -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: notb %cl -; X64-LIN-NEXT: andb %sil, %cl -; X64-LIN-NEXT: addb %cl, %cl -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: notb %sil +; X64-LIN-NEXT: andb %cl, %sil +; X64-LIN-NEXT: addb %sil, %sil ; X64-LIN-NEXT: jne .LBB5_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: # kill: def $al killed $al killed $eax @@ -230,12 +230,12 @@ define i8 @test6(i8 %a, i8 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB5_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-WIN-NEXT: movl %edx, %ecx ; X64-WIN-NEXT: xorb %dl, %al -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: notb %cl -; X64-WIN-NEXT: andb %dl, %cl -; X64-WIN-NEXT: addb %cl, %cl -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: notb %dl +; X64-WIN-NEXT: 
andb %cl, %dl +; X64-WIN-NEXT: addb %dl, %dl ; X64-WIN-NEXT: jne .LBB5_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: retq @@ -262,12 +262,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind { ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB6_1: # %bb ; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: xorl $2147483646, %edx # imm = 0x7FFFFFFE -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: addl %edx, %edx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: jne .LBB6_1 ; X86-NEXT: # %bb.2: # %bb12 ; X86-NEXT: retl @@ -278,12 +278,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind { ; X64-LIN-NEXT: .p2align 4 ; X64-LIN-NEXT: .LBB6_1: # %bb ; X64-LIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LIN-NEXT: movl %esi, %ecx ; X64-LIN-NEXT: xorl %esi, %eax -; X64-LIN-NEXT: movl %eax, %ecx -; X64-LIN-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE -; X64-LIN-NEXT: andl %esi, %ecx -; X64-LIN-NEXT: addl %ecx, %ecx -; X64-LIN-NEXT: movl %ecx, %esi +; X64-LIN-NEXT: movl %eax, %esi +; X64-LIN-NEXT: xorl $2147483646, %esi # imm = 0x7FFFFFFE +; X64-LIN-NEXT: andl %ecx, %esi +; X64-LIN-NEXT: addl %esi, %esi ; X64-LIN-NEXT: jne .LBB6_1 ; X64-LIN-NEXT: # %bb.2: # %bb12 ; X64-LIN-NEXT: retq @@ -294,12 +294,12 @@ define i32 @test7(i32 %a, i32 %b) nounwind { ; X64-WIN-NEXT: .p2align 4 ; X64-WIN-NEXT: .LBB6_1: # %bb ; X64-WIN-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-WIN-NEXT: movl %edx, %ecx ; X64-WIN-NEXT: xorl %edx, %eax -; X64-WIN-NEXT: movl %eax, %ecx -; X64-WIN-NEXT: xorl $2147483646, %ecx # imm = 0x7FFFFFFE -; X64-WIN-NEXT: andl %edx, %ecx -; X64-WIN-NEXT: addl %ecx, %ecx -; X64-WIN-NEXT: movl %ecx, %edx +; X64-WIN-NEXT: movl %eax, %edx +; X64-WIN-NEXT: xorl $2147483646, %edx # imm = 0x7FFFFFFE +; X64-WIN-NEXT: andl %ecx, %edx +; X64-WIN-NEXT: addl %edx, %edx ; X64-WIN-NEXT: jne .LBB6_1 ; X64-WIN-NEXT: # %bb.2: # %bb12 ; X64-WIN-NEXT: retq diff --git a/llvm/test/DebugInfo/Generic/debuginfofinder-cu-source-language-names.ll b/llvm/test/DebugInfo/Generic/debuginfofinder-cu-source-language-names.ll new file mode 100644 index 0000000000000..aafeb5ceb0db3 --- /dev/null +++ b/llvm/test/DebugInfo/Generic/debuginfofinder-cu-source-language-names.ll @@ -0,0 +1,22 @@ +; RUN: opt -passes='print<module-debuginfo>' -disable-output 2>&1 < %s \ +; RUN: | FileCheck %s + +; CHECK: Compile unit: DW_LANG_C99 from /tmp/test1.c +; CHECK: Compile unit: DW_LNAME_C from /tmp/test2.c +; CHECK: Compile unit: unknown-language(0) from /tmp/test3.c + +!llvm.dbg.cu = !{!0, !6, !10} +!llvm.module.flags = !{!8, !9} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!1 = !DIFile(filename: "test1.c", directory: "/tmp") +!2 = !{} +!3 = !DIFile(filename: "test1.c", directory: "/tmp") +!4 = !DISubroutineType(types: !7) +!5 = !{null} +!6 = distinct !DICompileUnit(sourceLanguageName: DW_LNAME_C, producer: "clang", isOptimized: false, emissionKind: FullDebug, file: !7, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!7 = !DIFile(filename: "test2.c", directory: "/tmp") +!8 = !{i32 2, !"Dwarf Version", i32 4} +!9 = !{i32 1, !"Debug Info Version", i32 3} +!10 = distinct !DICompileUnit(sourceLanguageName: 0, producer: "clang", isOptimized: false, emissionKind: FullDebug, file: !11, enums: !2, retainedTypes: 
!2, globals: !2, imports: !2) +!11 = !DIFile(filename: "test3.c", directory: "/tmp") diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s index c393d3e819880..3f6d8feb45df0 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_err.s @@ -34,3 +34,83 @@ v_cvt_f32_bf16 v5, v1 div:2 // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. // GFX1250-ERR-NEXT:{{^}}v_cvt_f32_bf16 v5, v1 div:2 // GFX1250-ERR-NEXT:{{^}} ^ + +v_cos_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_cos_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_cos_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_cos_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_exp_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_exp_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_exp_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_exp_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_log_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_log_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_log_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_log_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rcp_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_rcp_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rcp_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_rcp_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rsq_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_rsq_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_rsq_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_rsq_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sin_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_sin_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sin_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_sin_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sqrt_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_sqrt_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_sqrt_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. +// GFX1250-ERR-NEXT:{{^}}v_sqrt_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ + +v_tanh_bf16 v1, v2 clamp +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GFX1250-ERR-NEXT:{{^}}v_tanh_bf16 v1, v2 clamp +// GFX1250-ERR-NEXT:{{^}} ^ + +v_tanh_bf16 v1, v2 mul:2 +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+// GFX1250-ERR-NEXT:{{^}}v_tanh_bf16 v1, v2 mul:2 +// GFX1250-ERR-NEXT:{{^}} ^ diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s index 0931523bbf40c..37ad6eb249da4 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s @@ -3781,15 +3781,6 @@ v_tanh_bf16_e64 v5, null v_tanh_bf16_e64 v5, -1 // GFX1250: v_tanh_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_tanh_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] - -v_tanh_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_tanh_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] - -v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_prng_b32_e64 v5, v1 // GFX1250: v_prng_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x01,0x00,0x00] @@ -3862,15 +3853,6 @@ v_rcp_bf16_e64 v5, null v_rcp_bf16_e64 v5, -1 // GFX1250: v_rcp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00] -v_rcp_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_rcp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08] - -v_rcp_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_rcp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10] - -v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_sqrt_bf16_e64 v5, v1 // GFX1250: v_sqrt_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x01,0x00,0x00] @@ -3907,15 +3889,6 @@ v_sqrt_bf16_e64 v5, null v_sqrt_bf16_e64 v5, -1 // GFX1250: v_sqrt_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_sqrt_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08] - -v_sqrt_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_sqrt_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10] - -v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_rsq_bf16_e64 v5, v1 // GFX1250: v_rsq_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x01,0x00,0x00] @@ -3952,15 +3925,6 @@ v_rsq_bf16_e64 v5, null v_rsq_bf16_e64 v5, -1 // GFX1250: v_rsq_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] -v_rsq_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_rsq_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] - -v_rsq_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_rsq_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] - -v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_log_bf16_e64 v5, v1 // GFX1250: v_log_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x01,0x00,0x00] @@ -3997,15 +3961,6 @@ v_log_bf16_e64 v5, null v_log_bf16_e64 v5, -1 // GFX1250: v_log_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] -v_log_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_log_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] - -v_log_bf16_e64 v5, src_scc mul:4 -// GFX1250: 
v_log_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] - -v_log_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_exp_bf16_e64 v5, v1 // GFX1250: v_exp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x01,0x00,0x00] @@ -4042,15 +3997,6 @@ v_exp_bf16_e64 v5, null v_exp_bf16_e64 v5, -1 // GFX1250: v_exp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] -v_exp_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_exp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] - -v_exp_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_exp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] - -v_exp_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_sin_bf16_e64 v5, v1 // GFX1250: v_sin_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x01,0x00,0x00] @@ -4087,15 +4033,6 @@ v_sin_bf16_e64 v5, null v_sin_bf16_e64 v5, -1 // GFX1250: v_sin_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] -v_sin_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_sin_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] - -v_sin_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_sin_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] - -v_sin_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_cos_bf16_e64 v5, v1 // GFX1250: v_cos_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x01,0x00,0x00] @@ -4132,15 +4069,6 @@ v_cos_bf16_e64 v5, null v_cos_bf16_e64 v5, -1 // GFX1250: v_cos_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] -v_cos_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_cos_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] - -v_cos_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_cos_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] - -v_cos_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_cos_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_cvt_f32_bf16_e64 v5, v1 // GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s index 5ac9eb47381d6..52f9ba3a99483 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s @@ -3952,15 +3952,6 @@ v_tanh_bf16_e64 v5.l, null v_tanh_bf16_e64 v5.l, -1 // GFX1250: v_tanh_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] -v_tanh_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_tanh_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] - -v_tanh_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_tanh_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] - -v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_tanh_bf16 v5.l, v128.h // GFX1250: v_tanh_bf16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0xca,0xd5,0x80,0x01,0x00,0x00] @@ -4036,15 +4027,6 @@ 
v_rcp_bf16_e64 v5.l, null v_rcp_bf16_e64 v5.l, -1 // GFX1250: v_rcp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00] -v_rcp_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_rcp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08] - -v_rcp_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_rcp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10] - -v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_rcp_bf16 v5.h, v128.h // GFX1250: v_rcp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xf9,0xd5,0x80,0x01,0x00,0x00] @@ -4084,15 +4066,6 @@ v_sqrt_bf16_e64 v5.l, null v_sqrt_bf16_e64 v5.l, -1 // GFX1250: v_sqrt_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_sqrt_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08] - -v_sqrt_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_sqrt_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10] - -v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_sqrt_bf16 v5.h, v128.h // GFX1250: v_sqrt_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfa,0xd5,0x80,0x01,0x00,0x00] @@ -4132,15 +4105,6 @@ v_rsq_bf16_e64 v5.l, null v_rsq_bf16_e64 v5.l, -1 // GFX1250: v_rsq_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] -v_rsq_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_rsq_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] - -v_rsq_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_rsq_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] - -v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_rsq_bf16 v5.h, v128.h // GFX1250: v_rsq_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00] @@ -4180,15 +4144,6 @@ v_log_bf16_e64 v5.l, null v_log_bf16_e64 v5.l, -1 // GFX1250: v_log_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] -v_log_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_log_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] - -v_log_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_log_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] - -v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] - v_log_bf16 v5.h, v128.h // GFX1250: v_log_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00] @@ -4228,15 +4183,6 @@ v_exp_bf16_e64 v5.l, null v_exp_bf16_e64 v5.l, -1 // GFX1250: v_exp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] -v_exp_bf16_e64 v5.l, 0.5 mul:2 -// GFX1250: v_exp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] - -v_exp_bf16_e64 v5.l, src_scc mul:4 -// GFX1250: v_exp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] - -v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 -// GFX1250: v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: 
[0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
v_exp_bf16 v5.h, v128.h
// GFX1250: v_exp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00]
@@ -4276,15 +4222,6 @@ v_sin_bf16_e64 v5.l, null
v_sin_bf16_e64 v5.l, -1
// GFX1250: v_sin_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00]

-v_sin_bf16_e64 v5.l, 0.5 mul:2
-// GFX1250: v_sin_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08]
-
-v_sin_bf16_e64 v5.l, src_scc mul:4
-// GFX1250: v_sin_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10]
-
-v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2
-// GFX1250: v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
v_sin_bf16 v5.h, v128.h
// GFX1250: v_sin_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00]

@@ -4324,15 +4261,6 @@ v_cos_bf16_e64 v5.l, null
v_cos_bf16_e64 v5.l, -1
// GFX1250: v_cos_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00]

-v_cos_bf16_e64 v5.l, 0.5 mul:2
-// GFX1250: v_cos_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08]
-
-v_cos_bf16_e64 v5.l, src_scc mul:4
-// GFX1250: v_cos_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10]
-
-v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2
-// GFX1250: v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
v_cos_bf16_e64 v5.h, v128.h
// GFX1250: v_cos_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xff,0xd5,0x80,0x01,0x00,0x00]

diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s
index b21fca654590a..21077fe4f9f05 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s
@@ -158,18 +158,6 @@ v_tanh_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
// GFX1250: v_tanh_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0]
// GFX1250: v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -258,18 +246,6 @@ v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -314,18 +290,6 @@ v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
// GFX1250: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -370,18 +334,6 @@ v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
// GFX1250: v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -426,18 +378,6 @@ v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
// GFX1250: v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
// GFX1250: v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -482,18 +422,6 @@ v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
// GFX1250: v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -538,18 +466,6 @@ v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
// GFX1250: v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -594,18 +510,6 @@ v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s
index d1638565a386a..646acf5219d7e 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s
@@ -162,18 +162,6 @@ v_tanh_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_tanh_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0]
// GFX1250: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xca,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -266,18 +254,6 @@ v_rcp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_rcp_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0]
// GFX1250: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -326,18 +302,6 @@ v_sqrt_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_sqrt_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0]
// GFX1250: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -386,18 +350,6 @@ v_rsq_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_rsq_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0]
// GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -446,18 +398,6 @@ v_log_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_log_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0]
// GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -506,18 +446,6 @@ v_exp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_exp_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0]
// GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -566,18 +494,6 @@ v_sin_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_sin_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0]
// GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -626,18 +542,6 @@ v_cos_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_cos_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0]
// GFX1250: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xff,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s
index 78afa10b984cb..1907a939b488b 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s
@@ -38,18 +38,6 @@ v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -58,114 +46,30 @@ v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s
index 6ec4d5f48f8b1..35a51dbe9f922 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s
@@ -42,18 +42,6 @@ v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_tanh_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -66,18 +54,6 @@ v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_rcp_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -86,18 +62,6 @@ v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_sqrt_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -106,18 +70,6 @@ v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_rsq_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -126,18 +78,6 @@ v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_log_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -146,18 +86,6 @@ v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_exp_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -166,18 +94,6 @@ v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_sin_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

@@ -186,18 +102,6 @@ v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
-v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00]
-// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-
v_cos_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt
index 67747a65ee52a..0b393973b7875 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt
@@ -4123,18 +4123,10 @@
# GFX1250-REAL16: v_tanh_f16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0x9f,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_tanh_f16_e64 v5, v128 ; encoding: [0x05,0x00,0x9f,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_tanh_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_tanh_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_tanh_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00]

@@ -4159,10 +4151,6 @@
# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xca,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_tanh_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xca,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_tanh_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_tanh_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_tanh_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00]

@@ -4223,18 +4211,10 @@
0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00
# GFX1250: v_prng_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00]

-0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rcp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_rcp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rcp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00]

@@ -4259,10 +4239,6 @@
# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xf9,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rcp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xf9,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_rcp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_rcp_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rcp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00]

@@ -4287,18 +4263,10 @@
# GFX1250-REAL16: v_rcp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xf9,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_rcp_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xf9,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00]

@@ -4323,10 +4291,6 @@
# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfa,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfa,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_sqrt_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00]

@@ -4351,18 +4315,10 @@
# GFX1250-REAL16: v_sqrt_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfa,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_sqrt_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfa,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rsq_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_rsq_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rsq_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00]

@@ -4387,10 +4343,6 @@
# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rsq_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_rsq_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_rsq_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00]

@@ -4415,18 +4367,10 @@
# GFX1250-REAL16: v_rsq_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_rsq_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfb,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_log_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_log_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_log_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_log_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_log_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_log_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00]

@@ -4451,10 +4395,6 @@
# GFX1250-REAL16: v_log_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_log_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_log_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_log_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_log_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_log_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00]

@@ -4479,18 +4419,10 @@
# GFX1250-REAL16: v_log_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_log_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfc,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_exp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_exp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_exp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_exp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_exp_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_exp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00]

@@ -4515,10 +4447,6 @@
# GFX1250-REAL16: v_exp_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_exp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_exp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_exp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_exp_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_exp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00]

@@ -4543,18 +4471,10 @@
# GFX1250-REAL16: v_exp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_exp_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfd,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_sin_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sin_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_sin_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_sin_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_sin_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sin_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00]

@@ -4579,10 +4499,6 @@
# GFX1250-REAL16: v_sin_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sin_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_sin_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_sin_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_sin_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_sin_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00]

@@ -4607,18 +4523,10 @@
# GFX1250-REAL16: v_sin_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00]
# GFX1250-FAKE16: v_sin_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfe,0xd5,0x80,0x01,0x00,0x00]

-0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00
-# GFX1250-REAL16: v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-# GFX1250-FAKE16: v_cos_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00]
-
0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00
# GFX1250-REAL16: v_cos_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00]
# GFX1250-FAKE16: v_cos_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00]

-0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08
-# GFX1250-REAL16: v_cos_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08]
-# GFX1250-FAKE16: v_cos_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08]
-
0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00
# GFX1250-REAL16: v_cos_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00]
# GFX1250-FAKE16: v_cos_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00]

@@ -4643,10 +4551,6 @@
# GFX1250-REAL16: v_cos_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00]
# GFX1250-FAKE16: v_cos_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00]

-0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10
-# GFX1250-REAL16: v_cos_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10]
-# GFX1250-FAKE16: v_cos_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10]
-
0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00
# GFX1250-REAL16: v_cos_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00]
# GFX1250-FAKE16: v_cos_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt
index 7c29f8ab01a1b..8b26d2a8696e2 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt
@@ -104,18 +104,6 @@
# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]
# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff]

-0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]

@@ -197,18 +185,6 @@
0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff
# GFX1250: v_prng_b32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff]

-0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30]
-
-0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01]
-
-0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13
-# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13]
-
0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff
# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding:
[0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -257,18 +233,6 @@ # GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -317,18 +281,6 @@ # GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: 
v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -377,18 +329,6 @@ # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -437,18 +377,6 @@ # GFX1250-REAL16: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -497,18 +425,6 @@ # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -557,18 +473,6 @@ # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v255.l, -|v255.l| 
clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - 0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt index d26bc46a1f272..15f76c54a1c65 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt @@ -34,22 +34,10 @@ # GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: 
v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] @@ -57,142 +45,58 @@ 0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 
mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 
-0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - 0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - 0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll index 2f7df0dbca303..8aeeda400ec6a 100644 --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -208,7 +208,6 @@ ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass -; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff 
--git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll index d2cac801816fe..73ebd23b730f5 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll @@ -132,7 +132,6 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass -; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll index beb988ddf3ae7..7d096590dc5fb 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -118,7 +118,6 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass -; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll index 66417b4ed532b..a82c18306128a 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -127,7 +127,6 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass -; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll index 31e4625b832b0..83aec1d1ad7f1 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll @@ -165,7 +165,6 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass -; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll index 530368fe07095..2e316342e99fe 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll @@ -167,7 +167,6 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass -; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll index 
894b8770866b7..b7ae2560b31c6 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll @@ -131,7 +131,6 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass -; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll b/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll index b1ef50382c070..c4933678d0391 100644 --- a/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll +++ b/llvm/test/Transforms/IndVarSimplify/floating-point-iv.ll @@ -417,3 +417,140 @@ loop: exit: ret void } + +define void @test_fp_to_int_irrealizable_initval() { +; CHECK-LABEL: @test_fp_to_int_irrealizable_initval( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi float [ 1.000000e+08, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: call void @opaque() +; CHECK-NEXT: [[IV_NEXT]] = fadd float [[IV]], -1.700000e+01 +; CHECK-NEXT: [[CMP:%.*]] = fcmp ult float [[IV_NEXT]], 2.500000e+01 +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi float [ 1.000000e+08, %entry ], [ %iv.next, %loop ] + call void @opaque() + %iv.next = fadd float %iv, -1.700000e+01 + %cmp = fcmp ult float %iv.next, 2.500000e+01 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @test_fp_to_int_irrealizable_exitval() { +; CHECK-LABEL: @test_fp_to_int_irrealizable_exitval( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi float [ 2.500000e+01, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: call void @opaque() +; CHECK-NEXT: [[IV_NEXT]] = fadd float [[IV]], 1.700000e+01 +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt float [[IV_NEXT]], 1.000000e+08 +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi float [ 2.500000e+01, %entry ], [ %iv.next, %loop ] + call void @opaque() + %iv.next = fadd float %iv, 1.700000e+01 + %cmp = fcmp ugt float %iv.next, 1.000000e+08 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @test_fp_to_int_irrealizable_negative_exitval() { +; CHECK-LABEL: @test_fp_to_int_irrealizable_negative_exitval( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi float [ -2.500000e+01, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: call void @opaque() +; CHECK-NEXT: [[IV_NEXT]] = fadd float [[IV]], -1.700000e+01 +; CHECK-NEXT: [[CMP:%.*]] = fcmp ult float [[IV_NEXT]], -1.000000e+08 +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi float [ -2.500000e+01, %entry ], [ %iv.next, %loop ] + call void @opaque() + %iv.next = fadd float %iv, -1.700000e+01 + %cmp = fcmp ult float %iv.next, -1.000000e+08 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @test_fp_to_int_irrealizable_exitval_pow_2_24() { +; CHECK-LABEL: @test_fp_to_int_irrealizable_exitval_pow_2_24( +; 
CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: call void @opaque() +; CHECK-NEXT: [[IV_NEXT]] = fadd float [[IV]], 1.000000e+00 +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt float [[IV_NEXT]], 0x4170000000000000 +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi float [ 0.000000e+00, %entry ], [ %iv.next, %loop ] + call void @opaque() + %iv.next = fadd float %iv, 1.000000e+00 + %cmp = fcmp ugt float %iv.next, 0x4170000000000000 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @test_fp_to_int_irrealizable_exitval_int64_min() { +; CHECK-LABEL: @test_fp_to_int_irrealizable_exitval_int64_min( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi double [ 2.500000e+01, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: call void @opaque() +; CHECK-NEXT: [[IV_NEXT]] = fadd double [[IV]], 1.700000e+01 +; CHECK-NEXT: [[CMP:%.*]] = fcmp ult double [[IV_NEXT]], 0xC3E0000000000000 +; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi double [ 2.500000e+01, %entry ], [ %iv.next, %loop ] + call void @opaque() + %iv.next = fadd double %iv, 1.700000e+01 + %cmp = fcmp ult double %iv.next, 0xC3E0000000000000 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +declare void @opaque() diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll index c12d8135e5eba..082b876b542e5 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll @@ -234,16 +234,17 @@ define void @extrastride(ptr nocapture %main, i32 %main_stride, ptr nocapture %r ; X32-NEXT: .p2align 4 ; X32-NEXT: .LBB2_2: # %for.body ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movl (%ebx,%esi), %ebp -; X32-NEXT: addl (%ebx), %ebp -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: addl (%esi,%ebx), %ebp -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: addl (%esi,%ebx), %ebp -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: addl (%esi,%ebx), %ebp -; X32-NEXT: movl %ebp, (%edx) -; X32-NEXT: addl %esi, %ebx +; X32-NEXT: movl %ebx, %ebp +; X32-NEXT: movl (%ebx,%esi), %ebx +; X32-NEXT: addl (%ebp), %ebx +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl (%esi,%ebp), %ebx +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl (%esi,%ebp), %ebx +; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl (%esi,%ebp), %ebx +; X32-NEXT: movl %ebx, (%edx) +; X32-NEXT: leal (%ebp,%esi), %ebx ; X32-NEXT: addl %edi, %ebx ; X32-NEXT: addl %ecx, %edx ; X32-NEXT: decl %eax diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll index d8f1a86c9ebda..5b9bd0997f2fa 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll @@ -182,313 +182,233 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [
[[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 ; CHECK-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 ; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1 +; CHECK-NEXT: [[TMP102:%.*]] = insertelement <16 x i8> poison, i8 [[TMP101]], i32 0 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: ; CHECK-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 ; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK: pred.load.if1: +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1 ; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1 +; CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 +; CHECK-NEXT: [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK: pred.load.continue2: ; CHECK-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 ; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK: pred.load.if3: +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 
[[TMP29]], i32 2 +; CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1 +; CHECK-NEXT: [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK: pred.load.continue4: ; CHECK-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ] ; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 ; CHECK-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] ; CHECK: pred.load.if5: +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1 ; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3 +; CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1 +; CHECK-NEXT: [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK: pred.load.continue6: ; CHECK-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ] ; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 ; CHECK-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] ; CHECK: pred.load.if7: +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1 ; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4 +; CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 +; CHECK-NEXT: [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] ; CHECK: pred.load.continue8: ; CHECK-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ] ; CHECK-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 ; CHECK-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] ; CHECK: pred.load.if9: +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 ; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1 ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5 +; CHECK-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1 +; CHECK-NEXT: [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]] ; CHECK: pred.load.continue10: ; CHECK-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ] +; CHECK-NEXT: [[TMP135:%.*]] = phi <16 
x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ] ; CHECK-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 ; CHECK-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] ; CHECK: pred.load.if11: +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 ; CHECK-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6 +; CHECK-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1 +; CHECK-NEXT: [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] ; CHECK: pred.load.continue12: ; CHECK-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ] +; CHECK-NEXT: [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ] ; CHECK-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 ; CHECK-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] ; CHECK: pred.load.if13: +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 ; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1 ; CHECK-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7 +; CHECK-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 +; CHECK-NEXT: [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] ; CHECK: pred.load.continue14: ; CHECK-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ] +; CHECK-NEXT: [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ] ; CHECK-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 ; CHECK-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] ; CHECK: pred.load.if15: +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1 ; CHECK-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8 +; CHECK-NEXT: [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1 +; CHECK-NEXT: [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE16]] ; CHECK: pred.load.continue16: ; CHECK-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ] +; CHECK-NEXT: [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ] ; CHECK-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 ; CHECK-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] ; CHECK: pred.load.if17: +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 ; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], 
align 1 ; CHECK-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9 +; CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1 +; CHECK-NEXT: [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE18]] ; CHECK: pred.load.continue18: ; CHECK-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ] +; CHECK-NEXT: [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ] ; CHECK-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 ; CHECK-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] ; CHECK: pred.load.if19: +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 ; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1 ; CHECK-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10 +; CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1 +; CHECK-NEXT: [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE20]] ; CHECK: pred.load.continue20: ; CHECK-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ] +; CHECK-NEXT: [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ] ; CHECK-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 ; CHECK-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] ; CHECK: pred.load.if21: +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 ; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1 ; CHECK-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11 +; CHECK-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1 +; CHECK-NEXT: [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE22]] ; CHECK: pred.load.continue22: ; CHECK-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ] +; CHECK-NEXT: [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ] ; CHECK-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 ; CHECK-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] ; CHECK: pred.load.if23: +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 ; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1 ; CHECK-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12 +; CHECK-NEXT: [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1 +; CHECK-NEXT: [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE24]] ; CHECK: pred.load.continue24: ; CHECK-NEXT: [[TMP81:%.*]] = phi <16 
x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ] +; CHECK-NEXT: [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ] ; CHECK-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 ; CHECK-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] ; CHECK: pred.load.if25: +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]] ; CHECK-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1 ; CHECK-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13 +; CHECK-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1 +; CHECK-NEXT: [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE26]] ; CHECK: pred.load.continue26: ; CHECK-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ] +; CHECK-NEXT: [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ] ; CHECK-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 ; CHECK-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] ; CHECK: pred.load.if27: +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] ; CHECK-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1 ; CHECK-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14 +; CHECK-NEXT: [[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1 +; CHECK-NEXT: [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE28]] ; CHECK: pred.load.continue28: ; CHECK-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ] +; CHECK-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ] ; CHECK-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 -; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] +; CHECK-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]] ; CHECK: pred.load.if29: +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 ; CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1 ; CHECK-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE30]] -; CHECK: pred.load.continue30: -; CHECK-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] -; CHECK-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32> -; CHECK-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 -; CHECK-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] -; CHECK: pred.load.if31: -; CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1 -; CHECK-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0 -; 
CHECK-NEXT: br label [[PRED_LOAD_CONTINUE32]] -; CHECK: pred.load.continue32: -; CHECK-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ] -; CHECK-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 -; CHECK-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] -; CHECK: pred.load.if33: -; CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 -; CHECK-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE34]] -; CHECK: pred.load.continue34: -; CHECK-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ] -; CHECK-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 -; CHECK-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]] -; CHECK: pred.load.if35: -; CHECK-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1 -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE36]] -; CHECK: pred.load.continue36: -; CHECK-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ] -; CHECK-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 -; CHECK-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]] -; CHECK: pred.load.if37: -; CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1 -; CHECK-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE38]] -; CHECK: pred.load.continue38: -; CHECK-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ] -; CHECK-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 -; CHECK-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]] -; CHECK: pred.load.if39: -; CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1 -; CHECK-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE40]] -; CHECK: pred.load.continue40: -; CHECK-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ] -; CHECK-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 -; CHECK-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]] -; CHECK: pred.load.if41: -; CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 -; CHECK-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE42]] -; CHECK: pred.load.continue42: -; CHECK-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ] -; CHECK-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 -; CHECK-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label 
[[PRED_LOAD_CONTINUE44:%.*]] -; CHECK: pred.load.if43: -; CHECK-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1 -; CHECK-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE44]] -; CHECK: pred.load.continue44: -; CHECK-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ] -; CHECK-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 -; CHECK-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]] -; CHECK: pred.load.if45: -; CHECK-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1 -; CHECK-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE46]] -; CHECK: pred.load.continue46: -; CHECK-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ] -; CHECK-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 -; CHECK-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]] -; CHECK: pred.load.if47: -; CHECK-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1 -; CHECK-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE48]] -; CHECK: pred.load.continue48: -; CHECK-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ] -; CHECK-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 -; CHECK-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]] -; CHECK: pred.load.if49: -; CHECK-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 -; CHECK-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE50]] -; CHECK: pred.load.continue50: -; CHECK-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ] -; CHECK-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 -; CHECK-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]] -; CHECK: pred.load.if51: -; CHECK-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1 -; CHECK-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE52]] -; CHECK: pred.load.continue52: -; CHECK-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ] -; CHECK-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 -; CHECK-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]] -; CHECK: pred.load.if53: -; CHECK-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1 -; CHECK-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE54]] -; CHECK: 
pred.load.continue54: -; CHECK-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ] -; CHECK-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 -; CHECK-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]] -; CHECK: pred.load.if55: -; CHECK-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1 -; CHECK-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE56]] -; CHECK: pred.load.continue56: -; CHECK-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ] -; CHECK-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 -; CHECK-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]] -; CHECK: pred.load.if57: -; CHECK-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1 -; CHECK-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE58]] -; CHECK: pred.load.continue58: -; CHECK-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ] -; CHECK-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 -; CHECK-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]] -; CHECK: pred.load.if59: -; CHECK-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1 -; CHECK-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE60]] -; CHECK: pred.load.continue60: -; CHECK-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ] -; CHECK-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 -; CHECK-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]] -; CHECK: pred.load.if61: ; CHECK-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1 ; CHECK-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE62]] -; CHECK: pred.load.continue62: -; CHECK-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ] +; CHECK: pred.load.continue30: +; CHECK-NEXT: [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] +; CHECK-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ] ; CHECK-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32> +; CHECK-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32> ; CHECK-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]] ; CHECK-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index e74830700776c..6ead2a4eecbe8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -997,313 +997,233 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] ; CHECK-INTERLEAVE1-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] ; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK-INTERLEAVE1: pred.load.if: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVE1-NEXT: [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP102:%.*]] = insertelement <16 x i8> poison, i8 [[TMP101]], i32 0 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK-INTERLEAVE1: pred.load.continue: ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK-INTERLEAVE1: pred.load.if1: +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] ; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = insertelement <16 x 
i8> [[TMP21]], i8 [[TMP24]], i32 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK-INTERLEAVE1: pred.load.continue2: ; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK-INTERLEAVE1: pred.load.if3: +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 ; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]] ; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK-INTERLEAVE1: pred.load.continue4: ; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] ; CHECK-INTERLEAVE1: pred.load.if5: +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 ; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE6]] ; CHECK-INTERLEAVE1: pred.load.continue6: ; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] ; CHECK-INTERLEAVE1: pred.load.if7: +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 ; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]] ; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = 
insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE8]] ; CHECK-INTERLEAVE1: pred.load.continue8: ; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]] ; CHECK-INTERLEAVE1: pred.load.if9: +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 ; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]] ; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5 +; CHECK-INTERLEAVE1-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE10]] ; CHECK-INTERLEAVE1: pred.load.continue10: ; CHECK-INTERLEAVE1-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] ; CHECK-INTERLEAVE1: pred.load.if11: +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 ; CHECK-INTERLEAVE1-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]] ; CHECK-INTERLEAVE1-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6 +; CHECK-INTERLEAVE1-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE12]] ; CHECK-INTERLEAVE1: pred.load.continue12: ; CHECK-INTERLEAVE1-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] ; CHECK-INTERLEAVE1: pred.load.if13: +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 ; CHECK-INTERLEAVE1-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]] ; CHECK-INTERLEAVE1-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1 ; 
CHECK-INTERLEAVE1-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7 +; CHECK-INTERLEAVE1-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE14]] ; CHECK-INTERLEAVE1: pred.load.continue14: ; CHECK-INTERLEAVE1-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] ; CHECK-INTERLEAVE1: pred.load.if15: +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]] ; CHECK-INTERLEAVE1-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE16]] ; CHECK-INTERLEAVE1: pred.load.continue16: ; CHECK-INTERLEAVE1-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]] ; CHECK-INTERLEAVE1: pred.load.if17: +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 ; CHECK-INTERLEAVE1-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-INTERLEAVE1-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9 +; CHECK-INTERLEAVE1-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] +; CHECK-INTERLEAVE1-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE18]] ; CHECK-INTERLEAVE1: pred.load.continue18: ; CHECK-INTERLEAVE1-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]] ; CHECK-INTERLEAVE1: pred.load.if19: +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 ; CHECK-INTERLEAVE1-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]] ; CHECK-INTERLEAVE1-NEXT: 
[[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10 +; CHECK-INTERLEAVE1-NEXT: [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE20]] ; CHECK-INTERLEAVE1: pred.load.continue20: ; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]] ; CHECK-INTERLEAVE1: pred.load.if21: +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 ; CHECK-INTERLEAVE1-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]] ; CHECK-INTERLEAVE1-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11 +; CHECK-INTERLEAVE1-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVE1-NEXT: [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE22]] ; CHECK-INTERLEAVE1: pred.load.continue22: ; CHECK-INTERLEAVE1-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]] ; CHECK-INTERLEAVE1: pred.load.if23: +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 ; CHECK-INTERLEAVE1-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]] ; CHECK-INTERLEAVE1-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12 +; CHECK-INTERLEAVE1-NEXT: [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] +; CHECK-INTERLEAVE1-NEXT: [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE24]] ; CHECK-INTERLEAVE1: pred.load.continue24: ; CHECK-INTERLEAVE1-NEXT: [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]] ; CHECK-INTERLEAVE1: pred.load.if25: +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 ; CHECK-INTERLEAVE1-NEXT: [[TMP83:%.*]] = getelementptr 
inbounds i8, ptr [[A]], i64 [[TMP13]] ; CHECK-INTERLEAVE1-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13 +; CHECK-INTERLEAVE1-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE26]] ; CHECK-INTERLEAVE1: pred.load.continue26: ; CHECK-INTERLEAVE1-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]] ; CHECK-INTERLEAVE1: pred.load.if27: +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 ; CHECK-INTERLEAVE1-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]] ; CHECK-INTERLEAVE1-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14 +; CHECK-INTERLEAVE1-NEXT: [[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVE1-NEXT: [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE28]] ; CHECK-INTERLEAVE1: pred.load.continue28: ; CHECK-INTERLEAVE1-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]] ; CHECK-INTERLEAVE1: pred.load.if29: +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 ; CHECK-INTERLEAVE1-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]] ; CHECK-INTERLEAVE1-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE30]] -; CHECK-INTERLEAVE1: pred.load.continue30: -; CHECK-INTERLEAVE1-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]] -; CHECK-INTERLEAVE1: pred.load.if31: -; CHECK-INTERLEAVE1-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] -; CHECK-INTERLEAVE1-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0 -; 
CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE32]] -; CHECK-INTERLEAVE1: pred.load.continue32: -; CHECK-INTERLEAVE1-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]] -; CHECK-INTERLEAVE1: pred.load.if33: -; CHECK-INTERLEAVE1-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] -; CHECK-INTERLEAVE1-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE34]] -; CHECK-INTERLEAVE1: pred.load.continue34: -; CHECK-INTERLEAVE1-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]] -; CHECK-INTERLEAVE1: pred.load.if35: -; CHECK-INTERLEAVE1-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] -; CHECK-INTERLEAVE1-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE36]] -; CHECK-INTERLEAVE1: pred.load.continue36: -; CHECK-INTERLEAVE1-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]] -; CHECK-INTERLEAVE1: pred.load.if37: -; CHECK-INTERLEAVE1-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE38]] -; CHECK-INTERLEAVE1: pred.load.continue38: -; CHECK-INTERLEAVE1-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]] -; CHECK-INTERLEAVE1: pred.load.if39: -; CHECK-INTERLEAVE1-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]] -; CHECK-INTERLEAVE1-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE40]] -; CHECK-INTERLEAVE1: pred.load.continue40: -; CHECK-INTERLEAVE1-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]] -; CHECK-INTERLEAVE1: pred.load.if41: -; CHECK-INTERLEAVE1-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]] -; 
CHECK-INTERLEAVE1-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE42]] -; CHECK-INTERLEAVE1: pred.load.continue42: -; CHECK-INTERLEAVE1-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]] -; CHECK-INTERLEAVE1: pred.load.if43: -; CHECK-INTERLEAVE1-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]] -; CHECK-INTERLEAVE1-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE44]] -; CHECK-INTERLEAVE1: pred.load.continue44: -; CHECK-INTERLEAVE1-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]] -; CHECK-INTERLEAVE1: pred.load.if45: -; CHECK-INTERLEAVE1-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]] -; CHECK-INTERLEAVE1-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE46]] -; CHECK-INTERLEAVE1: pred.load.continue46: -; CHECK-INTERLEAVE1-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]] -; CHECK-INTERLEAVE1: pred.load.if47: -; CHECK-INTERLEAVE1-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]] -; CHECK-INTERLEAVE1-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE48]] -; CHECK-INTERLEAVE1: pred.load.continue48: -; CHECK-INTERLEAVE1-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]] -; CHECK-INTERLEAVE1: pred.load.if49: -; CHECK-INTERLEAVE1-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE50]] -; CHECK-INTERLEAVE1: pred.load.continue50: -; CHECK-INTERLEAVE1-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP148]], label 
[[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]] -; CHECK-INTERLEAVE1: pred.load.if51: -; CHECK-INTERLEAVE1-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE52]] -; CHECK-INTERLEAVE1: pred.load.continue52: -; CHECK-INTERLEAVE1-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]] -; CHECK-INTERLEAVE1: pred.load.if53: -; CHECK-INTERLEAVE1-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]] -; CHECK-INTERLEAVE1-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE54]] -; CHECK-INTERLEAVE1: pred.load.continue54: -; CHECK-INTERLEAVE1-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]] -; CHECK-INTERLEAVE1: pred.load.if55: -; CHECK-INTERLEAVE1-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]] -; CHECK-INTERLEAVE1-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE56]] -; CHECK-INTERLEAVE1: pred.load.continue56: -; CHECK-INTERLEAVE1-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]] -; CHECK-INTERLEAVE1: pred.load.if57: -; CHECK-INTERLEAVE1-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]] -; CHECK-INTERLEAVE1-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE58]] -; CHECK-INTERLEAVE1: pred.load.continue58: -; CHECK-INTERLEAVE1-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]] -; CHECK-INTERLEAVE1: pred.load.if59: -; CHECK-INTERLEAVE1-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]] -; CHECK-INTERLEAVE1-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14 -; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE60]] -; CHECK-INTERLEAVE1: pred.load.continue60: -; CHECK-INTERLEAVE1-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], 
[[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ] -; CHECK-INTERLEAVE1-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]] -; CHECK-INTERLEAVE1: pred.load.if61: ; CHECK-INTERLEAVE1-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]] ; CHECK-INTERLEAVE1-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1 ; CHECK-INTERLEAVE1-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15 ; CHECK-INTERLEAVE1-NEXT: br label [[PRED_LOAD_CONTINUE62]] -; CHECK-INTERLEAVE1: pred.load.continue62: -; CHECK-INTERLEAVE1-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ] +; CHECK-INTERLEAVE1: pred.load.continue30: +; CHECK-INTERLEAVE1-NEXT: [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]] ; CHECK-INTERLEAVE1-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]]) @@ -1333,313 +1253,233 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ] -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9 -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12 -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13 -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15 ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK-INTERLEAVED: pred.load.if: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-INTERLEAVED-NEXT: 
[[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]] ; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]] +; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = insertelement <16 x i8> poison, i8 [[TMP101]], i32 0 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK-INTERLEAVED: pred.load.continue: ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] ; CHECK-INTERLEAVED: pred.load.if1: +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK-INTERLEAVED: pred.load.continue2: ; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] ; CHECK-INTERLEAVED: pred.load.if3: +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 ; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]] ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE4]] ; CHECK-INTERLEAVED: pred.load.continue4: ; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] ; CHECK-INTERLEAVED: pred.load.if5: +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 ; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = 
getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3
+; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3
 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE6]]
 ; CHECK-INTERLEAVED: pred.load.continue6:
 ; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ]
 ; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
 ; CHECK-INTERLEAVED: pred.load.if7:
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
 ; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4
+; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4
 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE8]]
 ; CHECK-INTERLEAVED: pred.load.continue8:
 ; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ]
 ; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
 ; CHECK-INTERLEAVED: pred.load.if9:
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
 ; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5
+; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5
 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE10]]
 ; CHECK-INTERLEAVED: pred.load.continue10:
 ; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ]
 ; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
 ; CHECK-INTERLEAVED: pred.load.if11:
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
 ; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6
+; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6
 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE12]]
 ; CHECK-INTERLEAVED: pred.load.continue12:
 ; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ]
 ; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
 ; CHECK-INTERLEAVED: pred.load.if13:
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
 ; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7
+; CHECK-INTERLEAVED-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7
 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE14]]
 ; CHECK-INTERLEAVED: pred.load.continue14:
 ; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ]
 ; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
 ; CHECK-INTERLEAVED: pred.load.if15:
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
 ; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8
 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE16]]
 ; CHECK-INTERLEAVED: pred.load.continue16:
 ; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ]
 ; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
 ; CHECK-INTERLEAVED: pred.load.if17:
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
 ; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9
+; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9
 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE18]]
 ; CHECK-INTERLEAVED: pred.load.continue18:
 ; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ]
 ; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
 ; CHECK-INTERLEAVED: pred.load.if19:
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
 ; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10
+; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10
 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE20]]
 ; CHECK-INTERLEAVED: pred.load.continue20:
 ; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ]
 ; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
 ; CHECK-INTERLEAVED: pred.load.if21:
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
 ; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11
+; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11
 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE22]]
 ; CHECK-INTERLEAVED: pred.load.continue22:
 ; CHECK-INTERLEAVED-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ]
 ; CHECK-INTERLEAVED-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
 ; CHECK-INTERLEAVED: pred.load.if23:
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
 ; CHECK-INTERLEAVED-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12
+; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12
 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE24]]
 ; CHECK-INTERLEAVED: pred.load.continue24:
 ; CHECK-INTERLEAVED-NEXT: [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ]
 ; CHECK-INTERLEAVED-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
 ; CHECK-INTERLEAVED: pred.load.if25:
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
 ; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13
+; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13
 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE26]]
 ; CHECK-INTERLEAVED: pred.load.continue26:
 ; CHECK-INTERLEAVED-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ]
 ; CHECK-INTERLEAVED-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
 ; CHECK-INTERLEAVED: pred.load.if27:
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
 ; CHECK-INTERLEAVED-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14
+; CHECK-INTERLEAVED-NEXT: [[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14
 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE28]]
 ; CHECK-INTERLEAVED: pred.load.continue28:
 ; CHECK-INTERLEAVED-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ]
 ; CHECK-INTERLEAVED-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]]
 ; CHECK-INTERLEAVED: pred.load.if29:
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
 ; CHECK-INTERLEAVED-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE30]]
-; CHECK-INTERLEAVED: pred.load.continue30:
-; CHECK-INTERLEAVED-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
-; CHECK-INTERLEAVED: pred.load.if31:
-; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE32]]
-; CHECK-INTERLEAVED: pred.load.continue32:
-; CHECK-INTERLEAVED-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
-; CHECK-INTERLEAVED: pred.load.if33:
-; CHECK-INTERLEAVED-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
-; CHECK-INTERLEAVED-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE34]]
-; CHECK-INTERLEAVED: pred.load.continue34:
-; CHECK-INTERLEAVED-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]]
-; CHECK-INTERLEAVED: pred.load.if35:
-; CHECK-INTERLEAVED-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
-; CHECK-INTERLEAVED-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE36]]
-; CHECK-INTERLEAVED: pred.load.continue36:
-; CHECK-INTERLEAVED-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]]
-; CHECK-INTERLEAVED: pred.load.if37:
-; CHECK-INTERLEAVED-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
-; CHECK-INTERLEAVED-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE38]]
-; CHECK-INTERLEAVED: pred.load.continue38:
-; CHECK-INTERLEAVED-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]]
-; CHECK-INTERLEAVED: pred.load.if39:
-; CHECK-INTERLEAVED-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
-; CHECK-INTERLEAVED-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE40]]
-; CHECK-INTERLEAVED: pred.load.continue40:
-; CHECK-INTERLEAVED-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]]
-; CHECK-INTERLEAVED: pred.load.if41:
-; CHECK-INTERLEAVED-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE42]]
-; CHECK-INTERLEAVED: pred.load.continue42:
-; CHECK-INTERLEAVED-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]]
-; CHECK-INTERLEAVED: pred.load.if43:
-; CHECK-INTERLEAVED-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE44]]
-; CHECK-INTERLEAVED: pred.load.continue44:
-; CHECK-INTERLEAVED-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]]
-; CHECK-INTERLEAVED: pred.load.if45:
-; CHECK-INTERLEAVED-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
-; CHECK-INTERLEAVED-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE46]]
-; CHECK-INTERLEAVED: pred.load.continue46:
-; CHECK-INTERLEAVED-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]]
-; CHECK-INTERLEAVED: pred.load.if47:
-; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
-; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE48]]
-; CHECK-INTERLEAVED: pred.load.continue48:
-; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]]
-; CHECK-INTERLEAVED: pred.load.if49:
-; CHECK-INTERLEAVED-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-INTERLEAVED-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE50]]
-; CHECK-INTERLEAVED: pred.load.continue50:
-; CHECK-INTERLEAVED-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]]
-; CHECK-INTERLEAVED: pred.load.if51:
-; CHECK-INTERLEAVED-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE52]]
-; CHECK-INTERLEAVED: pred.load.continue52:
-; CHECK-INTERLEAVED-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]]
-; CHECK-INTERLEAVED: pred.load.if53:
-; CHECK-INTERLEAVED-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
-; CHECK-INTERLEAVED-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE54]]
-; CHECK-INTERLEAVED: pred.load.continue54:
-; CHECK-INTERLEAVED-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]]
-; CHECK-INTERLEAVED: pred.load.if55:
-; CHECK-INTERLEAVED-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
-; CHECK-INTERLEAVED-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE56]]
-; CHECK-INTERLEAVED: pred.load.continue56:
-; CHECK-INTERLEAVED-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]]
-; CHECK-INTERLEAVED: pred.load.if57:
-; CHECK-INTERLEAVED-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
-; CHECK-INTERLEAVED-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE58]]
-; CHECK-INTERLEAVED: pred.load.continue58:
-; CHECK-INTERLEAVED-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]]
-; CHECK-INTERLEAVED: pred.load.if59:
-; CHECK-INTERLEAVED-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14
-; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE60]]
-; CHECK-INTERLEAVED: pred.load.continue60:
-; CHECK-INTERLEAVED-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ]
-; CHECK-INTERLEAVED-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]]
-; CHECK-INTERLEAVED: pred.load.if61:
 ; CHECK-INTERLEAVED-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15
 ; CHECK-INTERLEAVED-NEXT: br label [[PRED_LOAD_CONTINUE62]]
-; CHECK-INTERLEAVED: pred.load.continue62:
-; CHECK-INTERLEAVED-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ]
+; CHECK-INTERLEAVED: pred.load.continue30:
+; CHECK-INTERLEAVED-NEXT: [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ]
 ; CHECK-INTERLEAVED-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32>
 ; CHECK-INTERLEAVED-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer
 ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
@@ -1669,313 +1509,233 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) {
 ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE62:%.*]] ]
 ; CHECK-MAXBW-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE62]] ]
 ; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[PRED_LOAD_CONTINUE62]] ]
-; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
-; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
-; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
-; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
-; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
-; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
-; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
-; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
-; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
-; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
 ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = icmp ule <16 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
 ; CHECK-MAXBW-NEXT: br i1 [[TMP17]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
 ; CHECK-MAXBW: pred.load.if:
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP0]]
 ; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> poison, i8 [[TMP19]], i32 0
+; CHECK-MAXBW-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
+; CHECK-MAXBW-NEXT: [[TMP101:%.*]] = load i8, ptr [[TMP99]], align 1
+; CHECK-MAXBW-NEXT: [[TMP102:%.*]] = insertelement <16 x i8> poison, i8 [[TMP101]], i32 0
 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE]]
 ; CHECK-MAXBW: pred.load.continue:
 ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ]
+; CHECK-MAXBW-NEXT: [[TMP103:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP102]], [[PRED_LOAD_IF]] ]
 ; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
 ; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
 ; CHECK-MAXBW: pred.load.if1:
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]]
 ; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP24]], i32 1
+; CHECK-MAXBW-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
+; CHECK-MAXBW-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
+; CHECK-MAXBW-NEXT: [[TMP109:%.*]] = insertelement <16 x i8> [[TMP103]], i8 [[TMP105]], i32 1
 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE2]]
 ; CHECK-MAXBW: pred.load.continue2:
 ; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = phi <16 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP25]], [[PRED_LOAD_IF1]] ]
+; CHECK-MAXBW-NEXT: [[TMP111:%.*]] = phi <16 x i8> [ [[TMP103]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP109]], [[PRED_LOAD_IF1]] ]
 ; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
 ; CHECK-MAXBW-NEXT: br i1 [[TMP27]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
 ; CHECK-MAXBW: pred.load.if3:
+; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
 ; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
 ; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP29]], i32 2
+; CHECK-MAXBW-NEXT: [[TMP112:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-MAXBW-NEXT: [[TMP113:%.*]] = load i8, ptr [[TMP112]], align 1
+; CHECK-MAXBW-NEXT: [[TMP114:%.*]] = insertelement <16 x i8> [[TMP111]], i8 [[TMP113]], i32 2
 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE4]]
 ; CHECK-MAXBW: pred.load.continue4:
 ; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = phi <16 x i8> [ [[TMP26]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP30]], [[PRED_LOAD_IF3]] ]
+; CHECK-MAXBW-NEXT: [[TMP115:%.*]] = phi <16 x i8> [ [[TMP111]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP114]], [[PRED_LOAD_IF3]] ]
 ; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
 ; CHECK-MAXBW-NEXT: br i1 [[TMP32]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
 ; CHECK-MAXBW: pred.load.if5:
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
 ; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP3]]
 ; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = load i8, ptr [[TMP33]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP34]], i32 3
+; CHECK-MAXBW-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
+; CHECK-MAXBW-NEXT: [[TMP121:%.*]] = load i8, ptr [[TMP119]], align 1
+; CHECK-MAXBW-NEXT: [[TMP122:%.*]] = insertelement <16 x i8> [[TMP115]], i8 [[TMP121]], i32 3
 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE6]]
 ; CHECK-MAXBW: pred.load.continue6:
 ; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = phi <16 x i8> [ [[TMP31]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP35]], [[PRED_LOAD_IF5]] ]
+; CHECK-MAXBW-NEXT: [[TMP123:%.*]] = phi <16 x i8> [ [[TMP115]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP122]], [[PRED_LOAD_IF5]] ]
 ; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
 ; CHECK-MAXBW-NEXT: br i1 [[TMP37]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
 ; CHECK-MAXBW: pred.load.if7:
+; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
 ; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP4]]
 ; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = load i8, ptr [[TMP38]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = insertelement <16 x i8> [[TMP36]], i8 [[TMP39]], i32 4
+; CHECK-MAXBW-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
+; CHECK-MAXBW-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
+; CHECK-MAXBW-NEXT: [[TMP129:%.*]] = insertelement <16 x i8> [[TMP123]], i8 [[TMP125]], i32 4
 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE8]]
 ; CHECK-MAXBW: pred.load.continue8:
 ; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = phi <16 x i8> [ [[TMP36]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP40]], [[PRED_LOAD_IF7]] ]
+; CHECK-MAXBW-NEXT: [[TMP131:%.*]] = phi <16 x i8> [ [[TMP123]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP129]], [[PRED_LOAD_IF7]] ]
 ; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
 ; CHECK-MAXBW-NEXT: br i1 [[TMP42]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
 ; CHECK-MAXBW: pred.load.if9:
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
 ; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP5]]
 ; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = load i8, ptr [[TMP43]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = insertelement <16 x i8> [[TMP41]], i8 [[TMP44]], i32 5
+; CHECK-MAXBW-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
+; CHECK-MAXBW-NEXT: [[TMP133:%.*]] = load i8, ptr [[TMP132]], align 1
+; CHECK-MAXBW-NEXT: [[TMP134:%.*]] = insertelement <16 x i8> [[TMP131]], i8 [[TMP133]], i32 5
 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE10]]
 ; CHECK-MAXBW: pred.load.continue10:
 ; CHECK-MAXBW-NEXT: [[TMP46:%.*]] = phi <16 x i8> [ [[TMP41]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP45]], [[PRED_LOAD_IF9]] ]
+; CHECK-MAXBW-NEXT: [[TMP135:%.*]] = phi <16 x i8> [ [[TMP131]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP134]], [[PRED_LOAD_IF9]] ]
 ; CHECK-MAXBW-NEXT: [[TMP47:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
 ; CHECK-MAXBW-NEXT: br i1 [[TMP47]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
 ; CHECK-MAXBW: pred.load.if11:
+; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
 ; CHECK-MAXBW-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP6]]
 ; CHECK-MAXBW-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP50:%.*]] = insertelement <16 x i8> [[TMP46]], i8 [[TMP49]], i32 6
+; CHECK-MAXBW-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
+; CHECK-MAXBW-NEXT: [[TMP141:%.*]] = load i8, ptr [[TMP139]], align 1
+; CHECK-MAXBW-NEXT: [[TMP142:%.*]] = insertelement <16 x i8> [[TMP135]], i8 [[TMP141]], i32 6
 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE12]]
 ; CHECK-MAXBW: pred.load.continue12:
 ; CHECK-MAXBW-NEXT: [[TMP51:%.*]] = phi <16 x i8> [ [[TMP46]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP50]], [[PRED_LOAD_IF11]] ]
+; CHECK-MAXBW-NEXT: [[TMP143:%.*]] = phi <16 x i8> [ [[TMP135]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP142]], [[PRED_LOAD_IF11]] ]
 ; CHECK-MAXBW-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
 ; CHECK-MAXBW-NEXT: br i1 [[TMP52]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
 ; CHECK-MAXBW: pred.load.if13:
+; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
 ; CHECK-MAXBW-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
 ; CHECK-MAXBW-NEXT: [[TMP54:%.*]] = load i8, ptr [[TMP53]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP55:%.*]] = insertelement <16 x i8> [[TMP51]], i8 [[TMP54]], i32 7
+; CHECK-MAXBW-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
+; CHECK-MAXBW-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
+; CHECK-MAXBW-NEXT: [[TMP149:%.*]] = insertelement <16 x i8> [[TMP143]], i8 [[TMP145]], i32 7
 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE14]]
 ; CHECK-MAXBW: pred.load.continue14:
 ; CHECK-MAXBW-NEXT: [[TMP56:%.*]] = phi <16 x i8> [ [[TMP51]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP55]], [[PRED_LOAD_IF13]] ]
+; CHECK-MAXBW-NEXT: [[TMP150:%.*]] = phi <16 x i8> [ [[TMP143]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP149]], [[PRED_LOAD_IF13]] ]
 ; CHECK-MAXBW-NEXT: [[TMP57:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
 ; CHECK-MAXBW-NEXT: br i1 [[TMP57]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
 ; CHECK-MAXBW: pred.load.if15:
+; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 8
 ; CHECK-MAXBW-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]]
 ; CHECK-MAXBW-NEXT: [[TMP59:%.*]] = load i8, ptr [[TMP58]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP60:%.*]] = insertelement <16 x i8> [[TMP56]], i8 [[TMP59]], i32 8
+; CHECK-MAXBW-NEXT: [[TMP151:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-MAXBW-NEXT: [[TMP152:%.*]] = load i8, ptr [[TMP151]], align 1
+; CHECK-MAXBW-NEXT: [[TMP153:%.*]] = insertelement <16 x i8> [[TMP150]], i8 [[TMP152]], i32 8
 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE16]]
 ; CHECK-MAXBW: pred.load.continue16:
 ; CHECK-MAXBW-NEXT: [[TMP61:%.*]] = phi <16 x i8> [ [[TMP56]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP60]], [[PRED_LOAD_IF15]] ]
+; CHECK-MAXBW-NEXT: [[TMP154:%.*]] = phi <16 x i8> [ [[TMP150]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP153]], [[PRED_LOAD_IF15]] ]
 ; CHECK-MAXBW-NEXT: [[TMP62:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
 ; CHECK-MAXBW-NEXT: br i1 [[TMP62]], label [[PRED_LOAD_IF17:%.*]], label [[PRED_LOAD_CONTINUE18:%.*]]
 ; CHECK-MAXBW: pred.load.if17:
+; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 9
 ; CHECK-MAXBW-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
 ; CHECK-MAXBW-NEXT: [[TMP64:%.*]] = load i8, ptr [[TMP63]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP65:%.*]] = insertelement <16 x i8> [[TMP61]], i8 [[TMP64]], i32 9
+; CHECK-MAXBW-NEXT: [[TMP96:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
+; CHECK-MAXBW-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP96]], align 1
+; CHECK-MAXBW-NEXT: [[TMP98:%.*]] = insertelement <16 x i8> [[TMP154]], i8 [[TMP155]], i32 9
 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE18]]
 ; CHECK-MAXBW: pred.load.continue18:
 ; CHECK-MAXBW-NEXT: [[TMP66:%.*]] = phi <16 x i8> [ [[TMP61]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP65]], [[PRED_LOAD_IF17]] ]
+; CHECK-MAXBW-NEXT: [[TMP100:%.*]] = phi <16 x i8> [ [[TMP154]], [[PRED_LOAD_CONTINUE16]] ], [ [[TMP98]], [[PRED_LOAD_IF17]] ]
 ; CHECK-MAXBW-NEXT: [[TMP67:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
 ; CHECK-MAXBW-NEXT: br i1 [[TMP67]], label [[PRED_LOAD_IF19:%.*]], label [[PRED_LOAD_CONTINUE20:%.*]]
 ; CHECK-MAXBW: pred.load.if19:
+; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 10
 ; CHECK-MAXBW-NEXT: [[TMP68:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP10]]
 ; CHECK-MAXBW-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP68]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = insertelement <16 x i8> [[TMP66]], i8 [[TMP69]], i32 10
+; CHECK-MAXBW-NEXT: [[TMP106:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
+; CHECK-MAXBW-NEXT: [[TMP107:%.*]] = load i8, ptr [[TMP106]], align 1
+; CHECK-MAXBW-NEXT: [[TMP108:%.*]] = insertelement <16 x i8> [[TMP100]], i8 [[TMP107]], i32 10
 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE20]]
 ; CHECK-MAXBW: pred.load.continue20:
 ; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = phi <16 x i8> [ [[TMP66]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP70]], [[PRED_LOAD_IF19]] ]
+; CHECK-MAXBW-NEXT: [[TMP110:%.*]] = phi <16 x i8> [ [[TMP100]], [[PRED_LOAD_CONTINUE18]] ], [ [[TMP108]], [[PRED_LOAD_IF19]] ]
 ; CHECK-MAXBW-NEXT: [[TMP72:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
 ; CHECK-MAXBW-NEXT: br i1 [[TMP72]], label [[PRED_LOAD_IF21:%.*]], label [[PRED_LOAD_CONTINUE22:%.*]]
 ; CHECK-MAXBW: pred.load.if21:
+; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 11
 ; CHECK-MAXBW-NEXT: [[TMP73:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]]
 ; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = load i8, ptr [[TMP73]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP75:%.*]] = insertelement <16 x i8> [[TMP71]], i8 [[TMP74]], i32 11
+; CHECK-MAXBW-NEXT: [[TMP116:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-MAXBW-NEXT: [[TMP117:%.*]] = load i8, ptr [[TMP116]], align 1
+; CHECK-MAXBW-NEXT: [[TMP118:%.*]] = insertelement <16 x i8> [[TMP110]], i8 [[TMP117]], i32 11
 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE22]]
 ; CHECK-MAXBW: pred.load.continue22:
 ; CHECK-MAXBW-NEXT: [[TMP76:%.*]] = phi <16 x i8> [ [[TMP71]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP75]], [[PRED_LOAD_IF21]] ]
+; CHECK-MAXBW-NEXT: [[TMP120:%.*]] = phi <16 x i8> [ [[TMP110]], [[PRED_LOAD_CONTINUE20]] ], [ [[TMP118]], [[PRED_LOAD_IF21]] ]
 ; CHECK-MAXBW-NEXT: [[TMP77:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
 ; CHECK-MAXBW-NEXT: br i1 [[TMP77]], label [[PRED_LOAD_IF23:%.*]], label [[PRED_LOAD_CONTINUE24:%.*]]
 ; CHECK-MAXBW: pred.load.if23:
+; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 12
 ; CHECK-MAXBW-NEXT: [[TMP78:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
 ; CHECK-MAXBW-NEXT: [[TMP79:%.*]] = load i8, ptr [[TMP78]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP80:%.*]] = insertelement <16 x i8> [[TMP76]], i8 [[TMP79]], i32 12
+; CHECK-MAXBW-NEXT: [[TMP126:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; CHECK-MAXBW-NEXT: [[TMP127:%.*]] = load i8, ptr [[TMP126]], align 1
+; CHECK-MAXBW-NEXT: [[TMP128:%.*]] = insertelement <16 x i8> [[TMP120]], i8 [[TMP127]], i32 12
 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE24]]
 ; CHECK-MAXBW: pred.load.continue24:
 ; CHECK-MAXBW-NEXT: [[TMP81:%.*]] = phi <16 x i8> [ [[TMP76]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP80]], [[PRED_LOAD_IF23]] ]
+; CHECK-MAXBW-NEXT: [[TMP130:%.*]] = phi <16 x i8> [ [[TMP120]], [[PRED_LOAD_CONTINUE22]] ], [ [[TMP128]], [[PRED_LOAD_IF23]] ]
 ; CHECK-MAXBW-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
 ; CHECK-MAXBW-NEXT: br i1 [[TMP82]], label [[PRED_LOAD_IF25:%.*]], label [[PRED_LOAD_CONTINUE26:%.*]]
 ; CHECK-MAXBW: pred.load.if25:
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 13
 ; CHECK-MAXBW-NEXT: [[TMP83:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]]
 ; CHECK-MAXBW-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP83]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP85:%.*]] = insertelement <16 x i8> [[TMP81]], i8 [[TMP84]], i32 13
+; CHECK-MAXBW-NEXT: [[TMP136:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
+; CHECK-MAXBW-NEXT: [[TMP137:%.*]] = load i8, ptr [[TMP136]], align 1
+; CHECK-MAXBW-NEXT: [[TMP138:%.*]] = insertelement <16 x i8> [[TMP130]], i8 [[TMP137]], i32 13
 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE26]]
 ; CHECK-MAXBW: pred.load.continue26:
 ; CHECK-MAXBW-NEXT: [[TMP86:%.*]] = phi <16 x i8> [ [[TMP81]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP85]], [[PRED_LOAD_IF25]] ]
+; CHECK-MAXBW-NEXT: [[TMP140:%.*]] = phi <16 x i8> [ [[TMP130]], [[PRED_LOAD_CONTINUE24]] ], [ [[TMP138]], [[PRED_LOAD_IF25]] ]
 ; CHECK-MAXBW-NEXT: [[TMP87:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
 ; CHECK-MAXBW-NEXT: br i1 [[TMP87]], label [[PRED_LOAD_IF27:%.*]], label [[PRED_LOAD_CONTINUE28:%.*]]
 ; CHECK-MAXBW: pred.load.if27:
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 14
 ; CHECK-MAXBW-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
 ; CHECK-MAXBW-NEXT: [[TMP89:%.*]] = load i8, ptr [[TMP88]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP90:%.*]] = insertelement <16 x i8> [[TMP86]], i8 [[TMP89]], i32 14
+; CHECK-MAXBW-NEXT: [[TMP146:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-MAXBW-NEXT: [[TMP147:%.*]] = load i8, ptr [[TMP146]], align 1
+; CHECK-MAXBW-NEXT: [[TMP148:%.*]] = insertelement <16 x i8> [[TMP140]], i8 [[TMP147]], i32 14
 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE28]]
 ; CHECK-MAXBW: pred.load.continue28:
 ; CHECK-MAXBW-NEXT: [[TMP91:%.*]] = phi <16 x i8> [ [[TMP86]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP90]], [[PRED_LOAD_IF27]] ]
+; CHECK-MAXBW-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP140]], [[PRED_LOAD_CONTINUE26]] ], [ [[TMP148]], [[PRED_LOAD_IF27]] ]
 ; CHECK-MAXBW-NEXT: [[TMP92:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-MAXBW-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE30:%.*]]
+; CHECK-MAXBW-NEXT: br i1 [[TMP92]], label [[PRED_LOAD_IF29:%.*]], label [[PRED_LOAD_CONTINUE62]]
 ; CHECK-MAXBW: pred.load.if29:
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 15
 ; CHECK-MAXBW-NEXT: [[TMP93:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP15]]
 ; CHECK-MAXBW-NEXT: [[TMP94:%.*]] = load i8, ptr [[TMP93]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP95:%.*]] = insertelement <16 x i8> [[TMP91]], i8 [[TMP94]], i32 15
-; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE30]]
-; CHECK-MAXBW: pred.load.continue30:
-; CHECK-MAXBW-NEXT: [[TMP96:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
-; CHECK-MAXBW-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP96]] to <16 x i32>
-; CHECK-MAXBW-NEXT: [[TMP98:%.*]] = extractelement <16 x i1> [[TMP16]], i32 0
-; CHECK-MAXBW-NEXT: br i1 [[TMP98]], label [[PRED_LOAD_IF31:%.*]], label [[PRED_LOAD_CONTINUE32:%.*]]
-; CHECK-MAXBW: pred.load.if31:
-; CHECK-MAXBW-NEXT: [[TMP99:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP0]]
-; CHECK-MAXBW-NEXT: [[TMP100:%.*]] = load i8, ptr [[TMP99]], align 1
-; CHECK-MAXBW-NEXT: [[TMP101:%.*]] = insertelement <16 x i8> poison, i8 [[TMP100]], i32 0
-; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE32]]
-; CHECK-MAXBW: pred.load.continue32:
-; CHECK-MAXBW-NEXT: [[TMP102:%.*]] = phi <16 x i8> [ poison, [[PRED_LOAD_CONTINUE30]] ], [ [[TMP101]], [[PRED_LOAD_IF31]] ]
-; CHECK-MAXBW-NEXT: [[TMP103:%.*]] = extractelement <16 x i1> [[TMP16]], i32 1
-; CHECK-MAXBW-NEXT: br i1 [[TMP103]], label [[PRED_LOAD_IF33:%.*]], label [[PRED_LOAD_CONTINUE34:%.*]]
-; CHECK-MAXBW: pred.load.if33:
-; CHECK-MAXBW-NEXT: [[TMP104:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP1]]
-; CHECK-MAXBW-NEXT: [[TMP105:%.*]] = load i8, ptr [[TMP104]], align 1
-; CHECK-MAXBW-NEXT: [[TMP106:%.*]] = insertelement <16 x i8> [[TMP102]], i8 [[TMP105]], i32 1
-; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE34]]
-; CHECK-MAXBW: pred.load.continue34:
-; CHECK-MAXBW-NEXT: [[TMP107:%.*]] = phi <16 x i8> [ [[TMP102]], [[PRED_LOAD_CONTINUE32]] ], [ [[TMP106]], [[PRED_LOAD_IF33]] ]
-; CHECK-MAXBW-NEXT: [[TMP108:%.*]] = extractelement <16 x i1> [[TMP16]], i32 2
-; CHECK-MAXBW-NEXT: br i1 [[TMP108]], label [[PRED_LOAD_IF35:%.*]], label [[PRED_LOAD_CONTINUE36:%.*]]
-; CHECK-MAXBW: pred.load.if35:
-; CHECK-MAXBW-NEXT: [[TMP109:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP2]]
-; CHECK-MAXBW-NEXT: [[TMP110:%.*]] = load i8, ptr [[TMP109]], align 1
-; CHECK-MAXBW-NEXT: [[TMP111:%.*]] = insertelement <16 x i8> [[TMP107]], i8 [[TMP110]], i32 2
-; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE36]]
-; CHECK-MAXBW: pred.load.continue36:
-; CHECK-MAXBW-NEXT: [[TMP112:%.*]] = phi <16 x i8> [ [[TMP107]], [[PRED_LOAD_CONTINUE34]] ], [ [[TMP111]], [[PRED_LOAD_IF35]] ]
-; CHECK-MAXBW-NEXT: [[TMP113:%.*]] = extractelement <16 x i1> [[TMP16]], i32 3
-; CHECK-MAXBW-NEXT: br i1 [[TMP113]], label [[PRED_LOAD_IF37:%.*]], label [[PRED_LOAD_CONTINUE38:%.*]]
-; CHECK-MAXBW: pred.load.if37:
-; CHECK-MAXBW-NEXT: [[TMP114:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP3]]
-; CHECK-MAXBW-NEXT: [[TMP115:%.*]] = load i8, ptr [[TMP114]], align 1
-; CHECK-MAXBW-NEXT: [[TMP116:%.*]] = insertelement <16 x i8> [[TMP112]], i8 [[TMP115]], i32 3
-; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE38]]
-; CHECK-MAXBW: pred.load.continue38:
-; CHECK-MAXBW-NEXT: [[TMP117:%.*]] = phi <16 x i8> [ [[TMP112]], [[PRED_LOAD_CONTINUE36]] ], [ [[TMP116]], [[PRED_LOAD_IF37]] ]
-; CHECK-MAXBW-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP16]], i32 4
-; CHECK-MAXBW-NEXT: br i1 [[TMP118]], label [[PRED_LOAD_IF39:%.*]], label [[PRED_LOAD_CONTINUE40:%.*]]
-; CHECK-MAXBW: pred.load.if39:
-; CHECK-MAXBW-NEXT: [[TMP119:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP4]]
-; CHECK-MAXBW-NEXT: [[TMP120:%.*]] = load i8, ptr [[TMP119]], align 1
-; CHECK-MAXBW-NEXT: [[TMP121:%.*]] = insertelement <16 x i8> [[TMP117]], i8 [[TMP120]], i32 4
-; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE40]]
-; CHECK-MAXBW: pred.load.continue40:
-; CHECK-MAXBW-NEXT: [[TMP122:%.*]] = phi <16 x i8> [ [[TMP117]], [[PRED_LOAD_CONTINUE38]] ], [ [[TMP121]], [[PRED_LOAD_IF39]] ]
-; CHECK-MAXBW-NEXT: [[TMP123:%.*]] = extractelement <16 x i1> [[TMP16]], i32 5
-; CHECK-MAXBW-NEXT: br i1 [[TMP123]], label [[PRED_LOAD_IF41:%.*]], label [[PRED_LOAD_CONTINUE42:%.*]]
-; CHECK-MAXBW: pred.load.if41:
-; CHECK-MAXBW-NEXT: [[TMP124:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP5]]
-; CHECK-MAXBW-NEXT: [[TMP125:%.*]] = load i8, ptr [[TMP124]], align 1
-; CHECK-MAXBW-NEXT: [[TMP126:%.*]] = insertelement <16 x i8> [[TMP122]], i8 [[TMP125]], i32 5
-; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE42]]
-; CHECK-MAXBW: pred.load.continue42:
-; CHECK-MAXBW-NEXT: [[TMP127:%.*]] = phi <16 x i8> [ [[TMP122]], [[PRED_LOAD_CONTINUE40]] ], [ [[TMP126]], [[PRED_LOAD_IF41]] ]
-; CHECK-MAXBW-NEXT: [[TMP128:%.*]] = extractelement <16 x i1> [[TMP16]], i32 6
-; CHECK-MAXBW-NEXT: br i1 [[TMP128]], label [[PRED_LOAD_IF43:%.*]], label [[PRED_LOAD_CONTINUE44:%.*]]
-; CHECK-MAXBW: pred.load.if43:
-; CHECK-MAXBW-NEXT: [[TMP129:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP6]]
-; CHECK-MAXBW-NEXT: [[TMP130:%.*]] = load i8, ptr [[TMP129]], align 1
-; CHECK-MAXBW-NEXT: [[TMP131:%.*]] = insertelement <16 x i8> [[TMP127]], i8 [[TMP130]], i32 6
-; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE44]]
-; CHECK-MAXBW: pred.load.continue44:
-; CHECK-MAXBW-NEXT: [[TMP132:%.*]] = phi <16 x i8> [ [[TMP127]], [[PRED_LOAD_CONTINUE42]] ], [ [[TMP131]], [[PRED_LOAD_IF43]] ]
-; CHECK-MAXBW-NEXT: [[TMP133:%.*]] = extractelement <16 x i1> [[TMP16]], i32 7
-; CHECK-MAXBW-NEXT: br i1 [[TMP133]], label [[PRED_LOAD_IF45:%.*]], label [[PRED_LOAD_CONTINUE46:%.*]]
-; CHECK-MAXBW: pred.load.if45:
-; CHECK-MAXBW-NEXT: [[TMP134:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP7]]
-; CHECK-MAXBW-NEXT: [[TMP135:%.*]] = load i8, ptr [[TMP134]], align 1
-; CHECK-MAXBW-NEXT: [[TMP136:%.*]] = insertelement <16 x i8> [[TMP132]], i8 [[TMP135]], i32 7
-; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE46]]
-; CHECK-MAXBW: pred.load.continue46:
-; CHECK-MAXBW-NEXT: [[TMP137:%.*]] = phi <16 x i8> [ [[TMP132]], [[PRED_LOAD_CONTINUE44]] ], [ [[TMP136]], [[PRED_LOAD_IF45]] ]
-; CHECK-MAXBW-NEXT: [[TMP138:%.*]] = extractelement <16 x i1> [[TMP16]], i32 8
-; CHECK-MAXBW-NEXT: br i1 [[TMP138]], label [[PRED_LOAD_IF47:%.*]], label [[PRED_LOAD_CONTINUE48:%.*]]
-; CHECK-MAXBW: pred.load.if47:
-; CHECK-MAXBW-NEXT: [[TMP139:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP8]]
-; CHECK-MAXBW-NEXT: [[TMP140:%.*]] = load i8, ptr [[TMP139]], align 1
-; CHECK-MAXBW-NEXT: [[TMP141:%.*]] = insertelement <16 x i8> [[TMP137]], i8 [[TMP140]], i32 8
-; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE48]]
-; CHECK-MAXBW: pred.load.continue48:
-; CHECK-MAXBW-NEXT: [[TMP142:%.*]] = phi <16 x i8> [ [[TMP137]], [[PRED_LOAD_CONTINUE46]] ], [ [[TMP141]], [[PRED_LOAD_IF47]] ]
-; CHECK-MAXBW-NEXT: [[TMP143:%.*]] = extractelement <16 x i1> [[TMP16]], i32 9
-; CHECK-MAXBW-NEXT: br i1 [[TMP143]], label [[PRED_LOAD_IF49:%.*]], label [[PRED_LOAD_CONTINUE50:%.*]]
-; CHECK-MAXBW: pred.load.if49:
-; CHECK-MAXBW-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-MAXBW-NEXT: [[TMP145:%.*]] = load i8, ptr [[TMP144]], align 1
-; CHECK-MAXBW-NEXT: [[TMP146:%.*]] = insertelement <16 x i8> [[TMP142]], i8 [[TMP145]], i32 9
-; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE50]]
-; CHECK-MAXBW: pred.load.continue50:
-; CHECK-MAXBW-NEXT: [[TMP147:%.*]] = phi <16 x i8> [ [[TMP142]], [[PRED_LOAD_CONTINUE48]] ], [ [[TMP146]], [[PRED_LOAD_IF49]] ]
-; CHECK-MAXBW-NEXT: [[TMP148:%.*]] = extractelement <16 x i1> [[TMP16]], i32 10
-; CHECK-MAXBW-NEXT: br i1 [[TMP148]], label [[PRED_LOAD_IF51:%.*]], label [[PRED_LOAD_CONTINUE52:%.*]]
-; CHECK-MAXBW: pred.load.if51:
-; CHECK-MAXBW-NEXT: [[TMP149:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-MAXBW-NEXT: [[TMP150:%.*]] = load i8, ptr [[TMP149]], align 1
-; CHECK-MAXBW-NEXT: [[TMP151:%.*]] = insertelement <16 x i8> [[TMP147]], i8 [[TMP150]], i32 10
-; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE52]]
-; CHECK-MAXBW: pred.load.continue52:
-; CHECK-MAXBW-NEXT: [[TMP152:%.*]] = phi <16 x i8> [ [[TMP147]], [[PRED_LOAD_CONTINUE50]] ], [ [[TMP151]], [[PRED_LOAD_IF51]] ]
-; CHECK-MAXBW-NEXT: [[TMP153:%.*]] = extractelement <16 x i1> [[TMP16]], i32 11
-; CHECK-MAXBW-NEXT: br i1 [[TMP153]], label [[PRED_LOAD_IF53:%.*]], label [[PRED_LOAD_CONTINUE54:%.*]]
-; CHECK-MAXBW: pred.load.if53:
-; CHECK-MAXBW-NEXT: [[TMP154:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP11]]
-; CHECK-MAXBW-NEXT: [[TMP155:%.*]] = load i8, ptr [[TMP154]], align 1
-; CHECK-MAXBW-NEXT: [[TMP156:%.*]] = insertelement <16 x i8> [[TMP152]], i8 [[TMP155]], i32 11
-; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE54]]
-; CHECK-MAXBW: pred.load.continue54:
-; CHECK-MAXBW-NEXT: [[TMP157:%.*]] = phi <16 x i8> [ [[TMP152]], [[PRED_LOAD_CONTINUE52]] ], [ [[TMP156]], [[PRED_LOAD_IF53]] ]
-; CHECK-MAXBW-NEXT: [[TMP158:%.*]] = extractelement <16 x i1> [[TMP16]], i32 12
-; CHECK-MAXBW-NEXT: br i1 [[TMP158]], label [[PRED_LOAD_IF55:%.*]], label [[PRED_LOAD_CONTINUE56:%.*]]
-; CHECK-MAXBW: pred.load.if55:
-; CHECK-MAXBW-NEXT: [[TMP159:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
-; CHECK-MAXBW-NEXT: [[TMP160:%.*]] = load i8, ptr [[TMP159]], align 1
-; CHECK-MAXBW-NEXT: [[TMP161:%.*]] = insertelement <16 x i8> [[TMP157]], i8 [[TMP160]], i32 12
-; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE56]]
-; CHECK-MAXBW: pred.load.continue56:
-; CHECK-MAXBW-NEXT: [[TMP162:%.*]] = phi <16 x i8> [ [[TMP157]], [[PRED_LOAD_CONTINUE54]] ], [ [[TMP161]], [[PRED_LOAD_IF55]] ]
-; CHECK-MAXBW-NEXT: [[TMP163:%.*]] = extractelement <16 x i1> [[TMP16]], i32 13
-; CHECK-MAXBW-NEXT: br i1 [[TMP163]], label [[PRED_LOAD_IF57:%.*]], label [[PRED_LOAD_CONTINUE58:%.*]]
-; CHECK-MAXBW: pred.load.if57:
-; CHECK-MAXBW-NEXT: [[TMP164:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP13]]
-; CHECK-MAXBW-NEXT: [[TMP165:%.*]] = load i8, ptr [[TMP164]], align 1
-; CHECK-MAXBW-NEXT: [[TMP166:%.*]] = insertelement <16 x i8> [[TMP162]], i8 [[TMP165]], i32 13
-; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE58]]
-; CHECK-MAXBW: pred.load.continue58:
-; CHECK-MAXBW-NEXT: [[TMP167:%.*]] = phi <16 x i8> [ [[TMP162]], [[PRED_LOAD_CONTINUE56]] ], [ [[TMP166]], [[PRED_LOAD_IF57]] ]
-; CHECK-MAXBW-NEXT: [[TMP168:%.*]] = extractelement <16 x i1> [[TMP16]], i32 14
-; CHECK-MAXBW-NEXT: br i1 [[TMP168]], label [[PRED_LOAD_IF59:%.*]], label [[PRED_LOAD_CONTINUE60:%.*]]
-; CHECK-MAXBW: pred.load.if59:
-; CHECK-MAXBW-NEXT: [[TMP169:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
-; CHECK-MAXBW-NEXT: [[TMP170:%.*]] = load i8, ptr [[TMP169]], align 1
-; CHECK-MAXBW-NEXT: [[TMP171:%.*]] = insertelement <16 x i8> [[TMP167]], i8 [[TMP170]], i32 14
-; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE60]]
-; CHECK-MAXBW: pred.load.continue60:
-; CHECK-MAXBW-NEXT: [[TMP172:%.*]] = phi <16 x i8> [ [[TMP167]], [[PRED_LOAD_CONTINUE58]] ], [ [[TMP171]], [[PRED_LOAD_IF59]] ]
-; CHECK-MAXBW-NEXT: [[TMP173:%.*]] = extractelement <16 x i1> [[TMP16]], i32 15
-; CHECK-MAXBW-NEXT: br i1 [[TMP173]], label [[PRED_LOAD_IF61:%.*]], label [[PRED_LOAD_CONTINUE62]]
-; CHECK-MAXBW: pred.load.if61:
 ; CHECK-MAXBW-NEXT: [[TMP174:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP15]]
 ; CHECK-MAXBW-NEXT: [[TMP175:%.*]] = load i8, ptr [[TMP174]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP176:%.*]] = insertelement <16 x i8> [[TMP172]], i8 [[TMP175]], i32 15
 ; CHECK-MAXBW-NEXT: br label [[PRED_LOAD_CONTINUE62]]
-; CHECK-MAXBW: pred.load.continue62:
-; CHECK-MAXBW-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE60]] ], [ [[TMP176]], [[PRED_LOAD_IF61]] ]
+; CHECK-MAXBW: pred.load.continue30:
+; CHECK-MAXBW-NEXT: [[TMP159:%.*]] = phi <16 x i8> [ [[TMP91]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP95]], [[PRED_LOAD_IF29]] ]
+; CHECK-MAXBW-NEXT: [[TMP177:%.*]] = phi <16 x i8> [ [[TMP172]], [[PRED_LOAD_CONTINUE28]] ], [ [[TMP176]], [[PRED_LOAD_IF29]] ]
 ; CHECK-MAXBW-NEXT: [[TMP178:%.*]] = sext <16 x i8> [[TMP177]] to <16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP97:%.*]] = sext <16 x i8> [[TMP159]] to <16 x i32>
 ; CHECK-MAXBW-NEXT: [[TMP179:%.*]] = mul nsw <16 x i32> [[TMP178]], [[TMP97]]
 ; CHECK-MAXBW-NEXT: [[TMP180:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP179]], <16 x i32> zeroinitializer
 ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP180]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 37eac89acfd11..ab593f6f8bb6b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -1370,10 +1370,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
 ; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
 ; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
 ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 16 x i32>
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
 ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 16 x i32> [[TMP16]], [[TMP13]]
 ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP17]], <vscale x 16 x i32> zeroinitializer
 ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP18]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
index f2e3b708d7820..61da142ad376c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll
@@ -1,6 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^middle.block:" --version 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^middle.block:" --version 4
 ; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=1 < %s | FileCheck %s -check-prefix CHECK-UF1
 ; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-UF4
+; RUN: opt -S --passes=loop-vectorize -enable-wide-lane-mask -prefer-predicate-over-epilogue=predicate-dont-vectorize < %s | FileCheck %s -check-prefix CHECK-TF
+; RUN: opt -S --passes=forceattrs,loop-vectorize -enable-wide-lane-mask -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-attribute=optsize < %s | FileCheck %s -check-prefix CHECK-UF1
 
 target triple = "aarch64-unknown-linux"
 
@@ -101,6 +103,49 @@ define void @scalable_wide_active_lane_mask(ptr noalias %dst, ptr readonly %src,
 ; CHECK-UF4-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-UF4: middle.block:
 ;
+; CHECK-TF-LABEL: define void @scalable_wide_active_lane_mask(
+; CHECK-TF-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-TF-NEXT: entry:
+; CHECK-TF-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK-TF: vector.ph:
+; CHECK-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
+; CHECK-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 5
+; CHECK-TF-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]]
+; CHECK-TF-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]]
+; CHECK-TF-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 [[N]])
+; CHECK-TF-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 16)
+; CHECK-TF-NEXT: [[TMP8:%.*]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-TF: vector.body:
+; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 16 x i1> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 4
+; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[TMP11]]
+; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP9]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP12]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 16 x i8> poison)
+; CHECK-TF-NEXT: [[TMP13:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 3)
+; CHECK-TF-NEXT: [[TMP14:%.*]] = mul <vscale x 16 x i8> [[WIDE_MASKED_LOAD2]], splat (i8 3)
+; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 4
+; CHECK-TF-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 [[TMP17]]
+; CHECK-TF-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP13]], ptr align 1 [[TMP15]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-TF-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP14]], ptr align 1 [[TMP18]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK1]])
+; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 [[INDEX]], i64 [[TMP6]])
+; CHECK-TF-NEXT: [[TMP19]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 16)
+; CHECK-TF-NEXT: [[TMP20]] = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv32i1(<vscale x 32 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-TF-NEXT: [[TMP21:%.*]] = extractelement <vscale x 16 x i1> [[TMP20]], i32 0
+; CHECK-TF-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true
+; CHECK-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-TF: middle.block:
+;
 entry:
   br label %for.body
@@ -222,6 +267,52 @@ define void @scalable_wide_active_lane_mask_double(ptr noalias %dst, ptr readonl
 ; CHECK-UF4-NEXT: br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK-UF4: middle.block:
 ;
+; CHECK-TF-LABEL: define void @scalable_wide_active_lane_mask_double(
+; CHECK-TF-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-TF-NEXT: entry:
+; CHECK-TF-NEXT: [[CMP6:%.*]] = icmp sgt i64 [[N]], 0
+; CHECK-TF-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK-TF: for.body.preheader:
+; CHECK-TF-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK-TF: vector.ph:
+; CHECK-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
+; CHECK-TF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; CHECK-TF-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[TMP3]]
+; CHECK-TF-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], [[TMP3]]
+; CHECK-TF-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-TF-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 2)
+; CHECK-TF-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0)
+; CHECK-TF-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-TF: vector.body:
+; CHECK-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = phi <vscale x 2 x i1> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; CHECK-TF-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-TF-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1
+; CHECK-TF-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[TMP9]], i64 [[TMP11]]
+; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x double> poison)
+; CHECK-TF-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP12]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK1]], <vscale x 2 x double> poison)
+; CHECK-TF-NEXT: [[TMP13:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00)
+; CHECK-TF-NEXT: [[TMP14:%.*]] = fmul <vscale x 2 x double> [[WIDE_MASKED_LOAD2]], splat (double 3.000000e+00)
+; CHECK-TF-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
+; CHECK-TF-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-TF-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 1
+; CHECK-TF-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP17]]
+; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP13]], ptr align 8 [[TMP15]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-TF-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr align 8 [[TMP18]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK1]])
+; CHECK-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
+; CHECK-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]])
+; CHECK-TF-NEXT: [[TMP19]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 2)
+; CHECK-TF-NEXT: [[TMP20]] = call <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv4i1(<vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0)
+; CHECK-TF-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[TMP20]], i32 0
+; CHECK-TF-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true
+; CHECK-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-TF: middle.block:
+;
 entry:
   %cmp6 = icmp sgt i64 %n, 0
   br i1 %cmp6, label %for.body, label %for.end
@@ -243,14 +334,3 @@ for.end:
 
 attributes #0 = { nounwind vscale_range(1,16) "target-features"="+sve2p1" }
 
-;.
-; CHECK-UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
-;.
-; CHECK-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK-UF4: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
-;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index 62e248bed85d9..0c3b987a74ece 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -124,6 +124,77 @@ exit:
   ret i32 %add
 }
 
+; Test that we also get VPExpressions when there is predication.
+define i32 @print_partial_reduction_predication(ptr %a, ptr %b, i64 %N) "target-features"="+sve" {
+; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' {
+; CHECK-NEXT: Live-in vp<%0> = VF
+; CHECK-NEXT: Live-in vp<%1> = VF * UF
+; CHECK-NEXT: Live-in ir<%N> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: EMIT vp<%4> = reduction-start-vector ir<0>, ir<0>, ir<4>
+; CHECK-NEXT: EMIT vp<%5> = TC > VF ? TC - VF : 0 ir<%N>
+; CHECK-NEXT: EMIT vp<%index.part.next> = VF * Part + ir<0>
+; CHECK-NEXT: EMIT vp<%active.lane.mask.entry> = active lane mask vp<%index.part.next>, ir<%N>, ir<1>
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: ACTIVE-LANE-MASK-PHI vp<%7> = phi vp<%active.lane.mask.entry>, vp<%active.lane.mask.next>
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<%4>, vp<%11> (VF scaled by 1/4)
+; CHECK-NEXT: vp<%8> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0>
+; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%8>
+; CHECK-NEXT: vp<%9> = vector-pointer ir<%gep.a>
+; CHECK-NEXT: WIDEN ir<%load.a> = load vp<%9>, vp<%7>
+; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%8>
+; CHECK-NEXT: vp<%10> = vector-pointer ir<%gep.b>
+; CHECK-NEXT: WIDEN ir<%load.b> = load vp<%10>, vp<%7>
+; CHECK-NEXT: EXPRESSION vp<%11> = vp<%7> + partial.reduce.add (mul (ir<%load.b> zext to i32), (ir<%load.a> zext to i32), )
+; CHECK-NEXT: EMIT vp<%index.next> = add vp<%6>, vp<%1>
+; CHECK-NEXT: EMIT vp<%12> = VF * Part + vp<%6>
+; CHECK-NEXT: EMIT vp<%active.lane.mask.next> = active lane mask vp<%12>, vp<%5>, ir<1>
+; CHECK-NEXT: EMIT vp<%13> = not vp<%active.lane.mask.next>
+; CHECK-NEXT: EMIT branch-on-cond vp<%13>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<%15> = compute-reduction-result ir<%accum>, vp<%11>
+; CHECK-NEXT: Successor(s): ir-bb<exit>
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>:
+; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<%15> from middle.block)
+; CHECK-NEXT: No successors
+entry:
+  br label %for.body
+
+for.body: ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %mul = mul i32 %ext.b, %ext.a
+  %add = add i32 %mul, %accum
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !1
+
+exit:
+  ret i32 %add
+}
+
+
 !0 = distinct !{!0, !2, !3}
+!1 = distinct !{!1, !2, !4}
 !2 = !{!"llvm.loop.interleave.count", i32 1}
 !3 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}
+!4 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
index 212a5c99676f4..877484f5159fd 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
@@ -63,7 +63,7 @@ define void @predicated_uniform_load(ptr %src, i32 %n, ptr %dst, i1 %cond) {
 ; CHECK-NEXT: store i32 [[STORE]], ptr [[NBRBOXES]], align 4
 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp sgt i32 [[IV]], [[IBOX]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -114,7 +114,7 @@ define void @predicated_strided_store(ptr %start)
{ ; RVA23-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]] ; RVA23-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; RVA23-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; RVA23-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; RVA23-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; RVA23: middle.block: ; RVA23-NEXT: br label [[LOOP:%.*]] ; RVA23: exit: @@ -141,7 +141,7 @@ define void @predicated_strided_store(ptr %start) { ; RVA23ZVL1024B-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]] ; RVA23ZVL1024B-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; RVA23ZVL1024B-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; RVA23ZVL1024B-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; RVA23ZVL1024B-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; RVA23ZVL1024B: middle.block: ; RVA23ZVL1024B-NEXT: br label [[LOOP:%.*]] ; RVA23ZVL1024B: exit: @@ -185,16 +185,16 @@ define void @store_to_addr_generated_from_invariant_addr(ptr noalias %p0, ptr no ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P1:%.*]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv2p0.nxv2p0( [[BROADCAST_SPLAT1]], align 8 [[TMP5]], splat (i1 true), i32 [[TMP3]]) ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[P2:%.*]], align 4 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P3:%.*]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P3:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, ptr [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0( zeroinitializer, align 4 [[TMP7]], splat (i1 true), i32 [[TMP3]]) ; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0( zeroinitializer, align 4 [[TMP7]], splat (i1 true), i32 [[TMP3]]) ; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i8.nxv2p0( zeroinitializer, align 1 [[TMP7]], splat (i1 true), i32 [[TMP3]]) ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: exit: diff --git a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll index d08ca8c99e8ba..c37bf74f9c1b0 100644 --- a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll +++ b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll @@ -8,14 +8,14 @@ define void @pr63340(ptr %A, ptr %B) { ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 1 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x ptr> 
[[DOTSPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i8 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds ptr, ptr [[B]], i8 [[OFFSET_IDX]] -; CHECK-NEXT: store <4 x ptr> [[DOTSPLAT]], ptr [[TMP1]], align 8 +; CHECK-NEXT: store <4 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128 ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -55,11 +55,11 @@ define void @wide_gep_index_invariant(ptr noalias %dst, ptr noalias %src, i64 %n ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr [[TMP0]], i64 [[N]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP1]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, <4 x ptr> [[BROADCAST_SPLAT]], i64 [[N]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr ptr, ptr [[DST]], i64 [[INDEX]] -; CHECK-NEXT: store <4 x ptr> [[TMP1]], ptr [[TMP2]], align 8 +; CHECK-NEXT: store <4 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP2]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] diff --git a/llvm/test/tools/llc/save-stats.ll b/llvm/test/tools/llc/save-stats.ll index 4950625c809cc..a5769f86648dc 100644 --- a/llvm/test/tools/llc/save-stats.ll +++ b/llvm/test/tools/llc/save-stats.ll @@ -1,5 +1,6 @@ ; REQUIRES: asserts +; RUN: rm -rf %t.dir && mkdir -p %t.dir && cd %t.dir ; RUN: llc --save-stats=obj -o %t.s %s && cat %t.stats | FileCheck %s ; RUN: llc --save-stats=cwd -o %t.s %s && cat %{t:stem}.tmp.stats | FileCheck %s ; RUN: llc --save-stats -o %t.s %s && cat %{t:stem}.tmp.stats | FileCheck %s diff --git a/llvm/unittests/Analysis/AliasAnalysisTest.cpp b/llvm/unittests/Analysis/AliasAnalysisTest.cpp index 06066b1b92c51..a28d318ab32c8 100644 --- a/llvm/unittests/Analysis/AliasAnalysisTest.cpp +++ b/llvm/unittests/Analysis/AliasAnalysisTest.cpp @@ -232,18 +232,18 @@ TEST_F(AliasAnalysisTest, BatchAAPhiCycles) { LLVMContext C; SMDiagnostic Err; std::unique_ptr M = parseAssemblyString(R"( - define void @f(i8* noalias %a, i1 %c) { + define void @f(ptr noalias %a, i1 %c) { entry: br label %loop loop: - %phi = phi i8* [ null, %entry ], [ %a2, %loop ] + %phi = phi ptr [ null, %entry ], [ %a2, %loop ] %offset1 = phi i64 [ 0, %entry ], [ %offset2, %loop] %offset2 = add i64 %offset1, 1 - %a1 = getelementptr i8, i8* %a, i64 %offset1 - %a2 = getelementptr i8, i8* %a, i64 %offset2 - %s1 = select i1 %c, i8* %a1, i8* %phi - 
%s2 = select i1 %c, i8* %a2, i8* %a1 + %a1 = getelementptr i8, ptr %a, i64 %offset1 + %a2 = getelementptr i8, ptr %a, i64 %offset2 + %s1 = select i1 %c, ptr %a1, ptr %phi + %s2 = select i1 %c, ptr %a2, ptr %a1 br label %loop } )", Err, C); @@ -280,15 +280,15 @@ TEST_F(AliasAnalysisTest, BatchAAPhiAssumption) { LLVMContext C; SMDiagnostic Err; std::unique_ptr M = parseAssemblyString(R"( - define void @f(i8* %a.base, i8* %b.base, i1 %c) { + define void @f(ptr %a.base, ptr %b.base, i1 %c) { entry: br label %loop loop: - %a = phi i8* [ %a.next, %loop ], [ %a.base, %entry ] - %b = phi i8* [ %b.next, %loop ], [ %b.base, %entry ] - %a.next = getelementptr i8, i8* %a, i64 1 - %b.next = getelementptr i8, i8* %b, i64 1 + %a = phi ptr [ %a.next, %loop ], [ %a.base, %entry ] + %b = phi ptr [ %b.next, %loop ], [ %b.base, %entry ] + %a.next = getelementptr i8, ptr %a, i64 1 + %b.next = getelementptr i8, ptr %b, i64 1 br label %loop } )", Err, C); @@ -318,16 +318,16 @@ TEST_F(AliasAnalysisTest, PartialAliasOffset) { LLVMContext C; SMDiagnostic Err; std::unique_ptr M = parseAssemblyString(R"( - define void @foo(float* %arg, i32 %i) { + define void @foo(ptr %arg, i32 %i) { bb: %i2 = zext i32 %i to i64 - %i3 = getelementptr inbounds float, float* %arg, i64 %i2 - %i4 = bitcast float* %i3 to <2 x float>* - %L1 = load <2 x float>, <2 x float>* %i4, align 16 + %i3 = getelementptr inbounds float, ptr %arg, i64 %i2 + %i4 = bitcast ptr %i3 to ptr + %L1 = load <2 x float>, ptr %i4, align 16 %i7 = add nuw nsw i32 %i, 1 %i8 = zext i32 %i7 to i64 - %i9 = getelementptr inbounds float, float* %arg, i64 %i8 - %L2 = load float, float* %i9, align 4 + %i9 = getelementptr inbounds float, ptr %arg, i64 %i8 + %L2 = load float, ptr %i9, align 4 ret void } )", @@ -353,11 +353,11 @@ TEST_F(AliasAnalysisTest, PartialAliasOffsetSign) { LLVMContext C; SMDiagnostic Err; std::unique_ptr M = parseAssemblyString(R"( - define void @f(i64* %p) { - %L1 = load i64, i64* %p - %p.i8 = bitcast i64* %p to i8* - %q = getelementptr i8, i8* %p.i8, i32 1 - %L2 = load i8, i8* %q + define void @f(ptr %p) { + %L1 = load i64, ptr %p + %p.i8 = bitcast ptr %p to ptr + %q = getelementptr i8, ptr %p.i8, i32 1 + %L2 = load i8, ptr %q ret void } )", @@ -388,10 +388,10 @@ class AAPassInfraTest : public testing::Test { public: AAPassInfraTest() - : M(parseAssemblyString("define i32 @f(i32* %x, i32* %y) {\n" + : M(parseAssemblyString("define i32 @f(ptr %x, ptr %y) {\n" "entry:\n" - " %lx = load i32, i32* %x\n" - " %ly = load i32, i32* %y\n" + " %lx = load i32, ptr %x\n" + " %ly = load i32, ptr %y\n" " %sum = add i32 %lx, %ly\n" " ret i32 %sum\n" "}\n", diff --git a/llvm/unittests/Analysis/AliasSetTrackerTest.cpp b/llvm/unittests/Analysis/AliasSetTrackerTest.cpp index e784e6eefb79c..b5adc84185c96 100644 --- a/llvm/unittests/Analysis/AliasSetTrackerTest.cpp +++ b/llvm/unittests/Analysis/AliasSetTrackerTest.cpp @@ -26,13 +26,13 @@ TEST(AliasSetTracker, AliasUnknownInst) { ; Function Attrs: nounwind ssp uwtable define i32 @read_a() #0 { - %1 = load i32, i32* @a, align 4, !tbaa !3 + %1 = load i32, ptr @a, align 4, !tbaa !3 ret i32 %1 } ; Function Attrs: nounwind ssp uwtable define void @write_b() #0 { - store float 1.000000e+01, float* @b, align 4, !tbaa !7 + store float 1.000000e+01, ptr @b, align 4, !tbaa !7 ret void } @@ -72,7 +72,7 @@ TEST(AliasSetTracker, AliasUnknownInst) { AliasSetTracker AST(BAA); for (auto &BB : *Test) AST.add(BB); - // There should be 2 disjoint alias sets. 1 from each call. + // There should be 2 disjoint alias sets. 1 from each call. 
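An aside on what the tracker has just built: @read_a only touches @a and @write_b only touches @b, and the two TBAA-tagged accesses are disjoint, so each call inside the test function seeds its own set. A minimal inspection sketch, reusing the AST local constructed just above (AliasSet::print is the stock dump helper):

    for (const AliasSet &AS : AST.getAliasSets())
      AS.print(errs()); // one line per set; two disjoint sets expected here

The assertion that follows pins the count at exactly two.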
ASSERT_EQ((int)AST.getAliasSets().size(), 2); // Directly test aliasesUnknownInst. diff --git a/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp b/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp index 5fd2ecc4f29b6..921e2aa8cd30b 100644 --- a/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp +++ b/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp @@ -74,18 +74,18 @@ TEST(AssumeQueryAPI, hasAttributeInAssume) { EnableKnowledgeRetention.setValue(true); StringRef Head = "declare void @llvm.assume(i1)\n" - "declare void @func(i32*, i32*, i32*)\n" - "declare void @func1(i32*, i32*, i32*, i32*)\n" - "declare void @func_many(i32*) \"no-jump-tables\" nounwind " + "declare void @func(ptr, ptr, ptr)\n" + "declare void @func1(ptr, ptr, ptr, ptr)\n" + "declare void @func_many(ptr) \"no-jump-tables\" nounwind " "\"less-precise-fpmad\" willreturn norecurse\n" - "define void @test(i32* %P, i32* %P1, i32* %P2, i32* %P3) {\n"; + "define void @test(ptr %P, ptr %P1, ptr %P2, ptr %P3) {\n"; StringRef Tail = "ret void\n" "}"; std::vector>> Tests; Tests.push_back(std::make_pair( - "call void @func(i32* nonnull align 4 dereferenceable(16) %P, i32* align " - "8 noalias %P1, i32* align 8 noundef %P2)\n", + "call void @func(ptr nonnull align 4 dereferenceable(16) %P, ptr align " + "8 noalias %P1, ptr align 8 noundef %P2)\n", [](Instruction *I) { auto *Assume = buildAssumeFromInst(I); Assume->insertBefore(I->getIterator()); @@ -103,11 +103,11 @@ TEST(AssumeQueryAPI, hasAttributeInAssume) { Attribute::AttrKind::Alignment, 4)); })); Tests.push_back(std::make_pair( - "call void @func1(i32* nonnull align 32 dereferenceable(48) %P, i32* " + "call void @func1(ptr nonnull align 32 dereferenceable(48) %P, ptr " "nonnull " - "align 8 dereferenceable(28) %P, i32* nonnull align 64 " + "align 8 dereferenceable(28) %P, ptr nonnull align 64 " "dereferenceable(4) " - "%P, i32* nonnull align 16 dereferenceable(12) %P)\n", + "%P, ptr nonnull align 16 dereferenceable(12) %P)\n", [](Instruction *I) { auto *Assume = buildAssumeFromInst(I); Assume->insertBefore(I->getIterator()); @@ -127,7 +127,7 @@ TEST(AssumeQueryAPI, hasAttributeInAssume) { Attribute::AttrKind::Alignment, 64)); })); Tests.push_back(std::make_pair( - "call void @func_many(i32* align 8 noundef %P1) cold\n", [](Instruction *I) { + "call void @func_many(ptr align 8 noundef %P1) cold\n", [](Instruction *I) { ShouldPreserveAllAttributes.setValue(true); auto *Assume = buildAssumeFromInst(I); Assume->insertBefore(I->getIterator()); @@ -142,11 +142,11 @@ TEST(AssumeQueryAPI, hasAttributeInAssume) { ASSERT_TRUE(hasMatchesExactlyAttributes(Assume, nullptr, "")); })); Tests.push_back(std::make_pair( - "call void @func1(i32* readnone align 32 " - "dereferenceable(48) noalias %P, i32* " - "align 8 dereferenceable(28) %P1, i32* align 64 " + "call void @func1(ptr readnone align 32 " + "dereferenceable(48) noalias %P, ptr " + "align 8 dereferenceable(28) %P1, ptr align 64 " "dereferenceable(4) " - "%P2, i32* nonnull align 16 dereferenceable(12) %P3)\n", + "%P2, ptr nonnull align 16 dereferenceable(12) %P3)\n", [](Instruction *I) { auto *Assume = buildAssumeFromInst(I); Assume->insertBefore(I->getIterator()); @@ -178,11 +178,11 @@ TEST(AssumeQueryAPI, hasAttributeInAssume) { })); Tests.push_back(std::make_pair( - "call void @func1(i32* readnone align 32 " - "dereferenceable(48) noalias %P, i32* " - "align 8 dereferenceable(28) %P1, i32* align 64 " + "call void @func1(ptr readnone align 32 " + "dereferenceable(48) noalias %P, ptr " + "align 8 dereferenceable(28) %P1, ptr 
align 64 " "dereferenceable(4) " - "%P2, i32* nonnull align 16 dereferenceable(12) %P3)\n", + "%P2, ptr nonnull align 16 dereferenceable(12) %P3)\n", [](Instruction *I) { auto *Assume = buildAssumeFromInst(I); Assume->insertBefore(I->getIterator()); @@ -204,8 +204,8 @@ TEST(AssumeQueryAPI, hasAttributeInAssume) { Attribute::AttrKind::Dereferenceable, 48)); })); Tests.push_back(std::make_pair( - "call void @func(i32* nonnull align 4 dereferenceable(16) %P, i32* align " - "8 noalias %P1, i32* %P1)\n", + "call void @func(ptr nonnull align 4 dereferenceable(16) %P, ptr align " + "8 noalias %P1, ptr %P1)\n", [](Instruction *I) { auto *Assume = buildAssumeFromInst(I); Assume->insertBefore(I->getIterator()); @@ -251,18 +251,18 @@ TEST(AssumeQueryAPI, fillMapFromAssume) { EnableKnowledgeRetention.setValue(true); StringRef Head = "declare void @llvm.assume(i1)\n" - "declare void @func(i32*, i32*, i32*)\n" - "declare void @func1(i32*, i32*, i32*, i32*)\n" - "declare void @func_many(i32*) \"no-jump-tables\" nounwind " + "declare void @func(ptr, ptr, ptr)\n" + "declare void @func1(ptr, ptr, ptr, ptr)\n" + "declare void @func_many(ptr) \"no-jump-tables\" nounwind " "\"less-precise-fpmad\" willreturn norecurse\n" - "define void @test(i32* %P, i32* %P1, i32* %P2, i32* %P3) {\n"; + "define void @test(ptr %P, ptr %P1, ptr %P2, ptr %P3) {\n"; StringRef Tail = "ret void\n" "}"; std::vector>> Tests; Tests.push_back(std::make_pair( - "call void @func(i32* nonnull align 4 dereferenceable(16) %P, i32* align " - "8 noalias %P1, i32* align 8 dereferenceable(8) %P2)\n", + "call void @func(ptr nonnull align 4 dereferenceable(16) %P, ptr align " + "8 noalias %P1, ptr align 8 dereferenceable(8) %P2)\n", [](Instruction *I) { auto *Assume = buildAssumeFromInst(I); Assume->insertBefore(I->getIterator()); @@ -283,11 +283,11 @@ TEST(AssumeQueryAPI, fillMapFromAssume) { {4, 4})); })); Tests.push_back(std::make_pair( - "call void @func1(i32* nonnull align 32 dereferenceable(48) %P, i32* " + "call void @func1(ptr nonnull align 32 dereferenceable(48) %P, ptr " "nonnull " - "align 8 dereferenceable(28) %P, i32* nonnull align 64 " + "align 8 dereferenceable(28) %P, ptr nonnull align 64 " "dereferenceable(4) " - "%P, i32* nonnull align 16 dereferenceable(12) %P)\n", + "%P, ptr nonnull align 16 dereferenceable(12) %P)\n", [](Instruction *I) { auto *Assume = buildAssumeFromInst(I); Assume->insertBefore(I->getIterator()); @@ -310,7 +310,7 @@ TEST(AssumeQueryAPI, fillMapFromAssume) { Map, Assume, {I->getOperand(0), Attribute::Alignment}, {64, 64})); })); Tests.push_back(std::make_pair( - "call void @func_many(i32* align 8 %P1) cold\n", [](Instruction *I) { + "call void @func_many(ptr align 8 %P1) cold\n", [](Instruction *I) { ShouldPreserveAllAttributes.setValue(true); auto *Assume = buildAssumeFromInst(I); Assume->insertBefore(I->getIterator()); @@ -331,11 +331,11 @@ TEST(AssumeQueryAPI, fillMapFromAssume) { ASSERT_TRUE(Map.empty()); })); Tests.push_back(std::make_pair( - "call void @func1(i32* readnone align 32 " - "dereferenceable(48) noalias %P, i32* " - "align 8 dereferenceable(28) %P1, i32* align 64 " + "call void @func1(ptr readnone align 32 " + "dereferenceable(48) noalias %P, ptr " + "align 8 dereferenceable(28) %P1, ptr align 64 " "dereferenceable(4) " - "%P2, i32* nonnull align 16 dereferenceable(12) %P3)\n", + "%P2, ptr nonnull align 16 dereferenceable(12) %P3)\n", [](Instruction *I) { auto *Assume = buildAssumeFromInst(I); Assume->insertBefore(I->getIterator()); @@ -371,8 +371,8 @@ TEST(AssumeQueryAPI, fillMapFromAssume) 
{ /// Keep this test last as it modifies the function. Tests.push_back(std::make_pair( - "call void @func(i32* nonnull align 4 dereferenceable(16) %P, i32* align " - "8 noalias %P1, i32* %P2)\n", + "call void @func(ptr nonnull align 4 dereferenceable(16) %P, ptr align " + "8 noalias %P1, ptr %P2)\n", [](Instruction *I) { auto *Assume = buildAssumeFromInst(I); Assume->insertBefore(I->getIterator()); @@ -507,11 +507,11 @@ TEST(AssumeQueryAPI, AssumptionCache) { SMDiagnostic Err; std::unique_ptr Mod = parseAssemblyString( "declare void @llvm.assume(i1)\n" - "define void @test(i32* %P, i32* %P1, i32* %P2, i32* %P3, i1 %B) {\n" - "call void @llvm.assume(i1 true) [\"nonnull\"(i32* %P), \"align\"(i32* " - "%P2, i32 4), \"align\"(i32* %P, i32 8)]\n" - "call void @llvm.assume(i1 %B) [\"test\"(i32* %P1), " - "\"dereferenceable\"(i32* %P, i32 4)]\n" + "define void @test(ptr %P, ptr %P1, ptr %P2, ptr %P3, i1 %B) {\n" + "call void @llvm.assume(i1 true) [\"nonnull\"(ptr %P), \"align\"(ptr " + "%P2, i32 4), \"align\"(ptr %P, i32 8)]\n" + "call void @llvm.assume(i1 %B) [\"test\"(ptr %P1), " + "\"dereferenceable\"(ptr %P, i32 4)]\n" "ret void\n}\n", Err, C); if (!Mod) @@ -569,11 +569,11 @@ TEST(AssumeQueryAPI, Alignment) { SMDiagnostic Err; std::unique_ptr Mod = parseAssemblyString( "declare void @llvm.assume(i1)\n" - "define void @test(i32* %P, i32* %P1, i32* %P2, i32 %I3, i1 %B) {\n" - "call void @llvm.assume(i1 true) [\"align\"(i32* %P, i32 8, i32 %I3)]\n" - "call void @llvm.assume(i1 true) [\"align\"(i32* %P1, i32 %I3, i32 " + "define void @test(ptr %P, ptr %P1, ptr %P2, i32 %I3, i1 %B) {\n" + "call void @llvm.assume(i1 true) [\"align\"(ptr %P, i32 8, i32 %I3)]\n" + "call void @llvm.assume(i1 true) [\"align\"(ptr %P1, i32 %I3, i32 " "%I3)]\n" - "call void @llvm.assume(i1 true) [\"align\"(i32* %P2, i32 16, i32 8)]\n" + "call void @llvm.assume(i1 true) [\"align\"(ptr %P2, i32 16, i32 8)]\n" "ret void\n}\n", Err, C); if (!Mod) diff --git a/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp b/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp index 17240a1c73bce..bf5afe8e79354 100644 --- a/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp +++ b/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp @@ -1936,26 +1936,26 @@ TEST_F(CGSCCPassManagerTest, TestDeletionOfFunctionInNonTrivialRefSCC) { TEST_F(CGSCCPassManagerTest, TestInsertionOfNewNonTrivialCallEdge) { std::unique_ptr M = parseIR("define void @f1() {\n" "entry:\n" - " %a = bitcast void ()* @f4 to i8*\n" - " %b = bitcast void ()* @f2 to i8*\n" + " %a = bitcast ptr @f4 to ptr\n" + " %b = bitcast ptr @f2 to ptr\n" " ret void\n" "}\n" "define void @f2() {\n" "entry:\n" - " %a = bitcast void ()* @f1 to i8*\n" - " %b = bitcast void ()* @f3 to i8*\n" + " %a = bitcast ptr @f1 to ptr\n" + " %b = bitcast ptr @f3 to ptr\n" " ret void\n" "}\n" "define void @f3() {\n" "entry:\n" - " %a = bitcast void ()* @f2 to i8*\n" - " %b = bitcast void ()* @f4 to i8*\n" + " %a = bitcast ptr @f2 to ptr\n" + " %b = bitcast ptr @f4 to ptr\n" " ret void\n" "}\n" "define void @f4() {\n" "entry:\n" - " %a = bitcast void ()* @f3 to i8*\n" - " %b = bitcast void ()* @f1 to i8*\n" + " %a = bitcast ptr @f3 to ptr\n" + " %b = bitcast ptr @f1 to ptr\n" " ret void\n" "}\n"); diff --git a/llvm/unittests/Analysis/CaptureTrackingTest.cpp b/llvm/unittests/Analysis/CaptureTrackingTest.cpp index ea3f21efc014c..d7ee5252d50be 100644 --- a/llvm/unittests/Analysis/CaptureTrackingTest.cpp +++ b/llvm/unittests/Analysis/CaptureTrackingTest.cpp @@ -20,27 +20,27 @@ using namespace llvm; TEST(CaptureTracking, 
MaxUsesToExplore) { StringRef Assembly = R"( ; Function Attrs: nounwind ssp uwtable - declare void @doesnt_capture(i8* nocapture, i8* nocapture, i8* nocapture, - i8* nocapture, i8* nocapture) + declare void @doesnt_capture(ptr nocapture, ptr nocapture, ptr nocapture, + ptr nocapture, ptr nocapture) ; %arg has 5 uses - define void @test_few_uses(i8* %arg) { - call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg) + define void @test_few_uses(ptr %arg) { + call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg) ret void } ; %arg has 50 uses - define void @test_many_uses(i8* %arg) { - call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg) - call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg) - call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg) - call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg) - call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg) - call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg) - call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg) - call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg) - call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg) - call void @doesnt_capture(i8* %arg, i8* %arg, i8* %arg, i8* %arg, i8* %arg) + define void @test_many_uses(ptr %arg) { + call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg) + call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg) + call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg) + call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg) + call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg) + call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg) + call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg) + call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg) + call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg) + call void @doesnt_capture(ptr %arg, ptr %arg, ptr %arg, ptr %arg, ptr %arg) ret void } )"; @@ -85,12 +85,12 @@ struct CollectingCaptureTracker : public CaptureTracker { TEST(CaptureTracking, MultipleUsesInSameInstruction) { StringRef Assembly = R"( - declare void @call(i8*, i8*, i8*) + declare void @call(ptr, ptr, ptr) - define void @test(i8* %arg, i8** %ptr) { - call void @call(i8* %arg, i8* nocapture %arg, i8* %arg) [ "bundle"(i8* %arg) ] - cmpxchg i8** %ptr, i8* %arg, i8* %arg acq_rel monotonic - icmp eq i8* %arg, %arg + define void @test(ptr %arg, ptr %ptr) { + call void @call(ptr %arg, ptr nocapture %arg, ptr %arg) [ "bundle"(ptr %arg) ] + cmpxchg ptr %ptr, ptr %arg, ptr %arg acq_rel monotonic + icmp eq ptr %arg, %arg ret void } )"; diff --git a/llvm/unittests/Analysis/DDGTest.cpp b/llvm/unittests/Analysis/DDGTest.cpp index 7fcdfdb62da43..12944a3f0cf3f 100644 --- a/llvm/unittests/Analysis/DDGTest.cpp +++ b/llvm/unittests/Analysis/DDGTest.cpp @@ -51,7 +51,7 @@ TEST(DDGTest, getDependencies) { "target datalayout = \"e-m:e-i64:64-n32:64\"\n" "target triple = \"powerpc64le-unknown-linux-gnu\"\n" "\n" - "define dso_local void @foo(i32 signext %n, i32* noalias %A, i32* " + "define dso_local void @foo(i32 signext %n, ptr noalias %A, ptr " "noalias %B) {\n" "entry:\n" " %cmp1 = icmp sgt i32 %n, 0\n" @@ -64,16 +64,16 @@ TEST(DDGTest, getDependencies) { " for.body:\n" " %indvars.iv 
= phi i64 [ 0, %for.body.preheader ], [ " "%indvars.iv.next, %for.body ]\n" - " %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv\n" + " %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv\n" " %0 = trunc i64 %indvars.iv to i32\n" - " store i32 %0, i32* %arrayidx, align 4\n" + " store i32 %0, ptr %arrayidx, align 4\n" " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" - " %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 " + " %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 " "%indvars.iv.next\n" - " %1 = load i32, i32* %arrayidx2, align 4\n" + " %1 = load i32, ptr %arrayidx2, align 4\n" " %add3 = add nsw i32 %1, 1\n" - " %arrayidx5 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv\n" - " store i32 %add3, i32* %arrayidx5, align 4\n" + " %arrayidx5 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv\n" + " store i32 %add3, ptr %arrayidx5, align 4\n" " %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count\n" " br i1 %exitcond, label %for.body, label %for.end.loopexit\n" "\n" @@ -142,8 +142,8 @@ TEST(DDGTest, avoidDuplicateEdgesToFromPiBlocks) { const char *ModuleStr = "target datalayout = \"e-m:e-i64:64-n32:64-v256:256:256-v512:512:512\"\n" "\n" - "define void @foo(float* noalias %A, float* noalias %B, float* noalias " - "%C, float* noalias %D, i32 signext %n) {\n" + "define void @foo(ptr noalias %A, ptr noalias %B, ptr noalias " + "%C, ptr noalias %D, i32 signext %n) {\n" "entry:\n" " %cmp1 = icmp sgt i32 %n, 0\n" " br i1 %cmp1, label %for.body.preheader, label %for.end\n" @@ -156,26 +156,26 @@ TEST(DDGTest, avoidDuplicateEdgesToFromPiBlocks) { "%for.body.preheader, %if.end\n" " %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, " "%if.end ]\n" - " %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv\n" - " %loadASubI = load float, float* %arrayidx, align 4\n" - " %arrayidx2 = getelementptr inbounds float, float* %B, i64 " + " %arrayidx = getelementptr inbounds float, ptr %A, i64 %indvars.iv\n" + " %loadASubI = load float, ptr %arrayidx, align 4\n" + " %arrayidx2 = getelementptr inbounds float, ptr %B, i64 " "%indvars.iv\n" - " %loadBSubI = load float, float* %arrayidx2, align 4\n" + " %loadBSubI = load float, ptr %arrayidx2, align 4\n" " %add = fadd fast float %loadASubI, %loadBSubI\n" - " %arrayidx4 = getelementptr inbounds float, float* %A, i64 " + " %arrayidx4 = getelementptr inbounds float, ptr %A, i64 " "%indvars.iv\n" - " store float %add, float* %arrayidx4, align 4\n" - " %arrayidx6 = getelementptr inbounds float, float* %A, i64 " + " store float %add, ptr %arrayidx4, align 4\n" + " %arrayidx6 = getelementptr inbounds float, ptr %A, i64 " "%indvars.iv\n" - " %0 = load float, float* %arrayidx6, align 4\n" + " %0 = load float, ptr %arrayidx6, align 4\n" " %add7 = fadd fast float %0, 1.000000e+00\n" " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" - " %arrayidx10 = getelementptr inbounds float, float* %B, i64 " + " %arrayidx10 = getelementptr inbounds float, ptr %B, i64 " "%indvars.iv.next\n" - " store float %add7, float* %arrayidx10, align 4\n" - " %arrayidx12 = getelementptr inbounds float, float* %A, i64 " + " store float %add7, ptr %arrayidx10, align 4\n" + " %arrayidx12 = getelementptr inbounds float, ptr %A, i64 " "%indvars.iv\n" - " %1 = load float, float* %arrayidx12, align 4\n" + " %1 = load float, ptr %arrayidx12, align 4\n" " %cmp13 = fcmp fast ogt float %1, 1.000000e+02\n" " br i1 %cmp13, label %if.then, label %if.else\n" "\n" @@ -188,7 +188,7 @@ TEST(DDGTest, 
avoidDuplicateEdgesToFromPiBlocks) { "if.end: ; preds = %if.else, " "%if.then\n" " %ff.0 = phi float [ %add, %if.then ], [ %add7, %if.else ]\n" - " store float %ff.0, float* %C, align 4\n" + " store float %ff.0, ptr %C, align 4\n" " %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count\n" " br i1 %exitcond, label %for.body, label %for.end.loopexit\n" "\n" diff --git a/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp b/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp index 497da8f3fc70b..dc5d0a8a7ca9b 100644 --- a/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp +++ b/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp @@ -457,7 +457,7 @@ define internal void @callee() { ret void } -define i32 @caller() personality i32 (...)* @__gxx_personality_v0 { +define i32 @caller() personality ptr @__gxx_personality_v0 { entry: invoke void @callee() to label %cont unwind label %exc @@ -466,7 +466,7 @@ define i32 @caller() personality i32 (...)* @__gxx_personality_v0 { ret i32 0 exc: - %exn = landingpad {i8*, i32} + %exn = landingpad {ptr, i32} cleanup ret i32 1 } @@ -498,7 +498,7 @@ TEST_F(FunctionPropertiesAnalysisTest, InvokeUnreachableHandler) { R"IR( declare void @might_throw() -define internal i32 @callee() personality i32 (...)* @__gxx_personality_v0 { +define internal i32 @callee() personality ptr @__gxx_personality_v0 { entry: invoke void @might_throw() to label %cont unwind label %exc @@ -507,12 +507,12 @@ define internal i32 @callee() personality i32 (...)* @__gxx_personality_v0 { ret i32 0 exc: - %exn = landingpad {i8*, i32} + %exn = landingpad {ptr, i32} cleanup - resume { i8*, i32 } %exn + resume { ptr, i32 } %exn } -define i32 @caller() personality i32 (...)* @__gxx_personality_v0 { +define i32 @caller() personality ptr @__gxx_personality_v0 { entry: %X = invoke i32 @callee() to label %cont unwind label %Handler @@ -521,7 +521,7 @@ define i32 @caller() personality i32 (...)* @__gxx_personality_v0 { ret i32 %X Handler: - %exn = landingpad {i8*, i32} + %exn = landingpad {ptr, i32} cleanup ret i32 1 } @@ -554,7 +554,7 @@ TEST_F(FunctionPropertiesAnalysisTest, Rethrow) { R"IR( declare void @might_throw() -define internal i32 @callee() personality i32 (...)* @__gxx_personality_v0 { +define internal i32 @callee() personality ptr @__gxx_personality_v0 { entry: invoke void @might_throw() to label %cont unwind label %exc @@ -563,12 +563,12 @@ define internal i32 @callee() personality i32 (...)* @__gxx_personality_v0 { ret i32 0 exc: - %exn = landingpad {i8*, i32} + %exn = landingpad {ptr, i32} cleanup - resume { i8*, i32 } %exn + resume { ptr, i32 } %exn } -define i32 @caller() personality i32 (...)* @__gxx_personality_v0 { +define i32 @caller() personality ptr @__gxx_personality_v0 { entry: %X = invoke i32 @callee() to label %cont unwind label %Handler @@ -577,7 +577,7 @@ define i32 @caller() personality i32 (...)* @__gxx_personality_v0 { ret i32 %X Handler: - %exn = landingpad {i8*, i32} + %exn = landingpad {ptr, i32} cleanup ret i32 1 } @@ -612,18 +612,18 @@ declare void @external_func() @exception_type2 = external global i8 -define internal void @inner() personality i8* null { +define internal void @inner() personality ptr null { invoke void @external_func() to label %cont unwind label %lpad cont: ret void lpad: %lp = landingpad i32 - catch i8* @exception_type1 + catch ptr @exception_type1 resume i32 %lp } -define void @outer() personality i8* null { +define void @outer() personality ptr null { invoke void @inner() to label %cont unwind label %lpad 
cont: @@ -631,7 +631,7 @@ define void @outer() personality i8* null { lpad: %lp = landingpad i32 cleanup - catch i8* @exception_type2 + catch ptr @exception_type2 resume i32 %lp } @@ -666,18 +666,18 @@ declare void @external_func() @exception_type2 = external global i8 -define internal void @inner() personality i8* null { +define internal void @inner() personality ptr null { invoke void @external_func() to label %cont unwind label %lpad cont: ret void lpad: %lp = landingpad i32 - catch i8* @exception_type1 + catch ptr @exception_type1 resume i32 %lp } -define void @outer(i32 %a) personality i8* null { +define void @outer(i32 %a) personality ptr null { entry: %i = icmp slt i32 %a, 0 br i1 %i, label %if.then, label %cont @@ -689,7 +689,7 @@ if.then: lpad: %lp = landingpad i32 cleanup - catch i8* @exception_type2 + catch ptr @exception_type2 resume i32 %lp } @@ -931,9 +931,9 @@ TEST_F(FunctionPropertiesAnalysisTest, DetailedOperandCount) { @a = global i64 1 define i64 @f1(i64 %e) { - %b = load i64, i64* @a + %b = load i64, ptr @a %c = add i64 %b, 2 - %d = call i64 asm "mov $1,$0", "=r,r" (i64 %c) + %d = call i64 asm "mov $1,$0", "=r,r" (i64 %c) %f = add i64 %d, %e ret i64 %f } diff --git a/llvm/unittests/Analysis/LazyCallGraphTest.cpp b/llvm/unittests/Analysis/LazyCallGraphTest.cpp index 4a4ff3242c4c4..5c0bfbd74555e 100644 --- a/llvm/unittests/Analysis/LazyCallGraphTest.cpp +++ b/llvm/unittests/Analysis/LazyCallGraphTest.cpp @@ -142,78 +142,78 @@ static const char DiamondOfTriangles[] = static const char DiamondOfTrianglesRefGraph[] = "define void @a1() {\n" "entry:\n" - " %a = alloca void ()*\n" - " store void ()* @a2, void ()** %a\n" - " store void ()* @b2, void ()** %a\n" - " store void ()* @c3, void ()** %a\n" + " %a = alloca ptr\n" + " store ptr @a2, ptr %a\n" + " store ptr @b2, ptr %a\n" + " store ptr @c3, ptr %a\n" " ret void\n" "}\n" "define void @a2() {\n" "entry:\n" - " %a = alloca void ()*\n" - " store void ()* @a3, void ()** %a\n" + " %a = alloca ptr\n" + " store ptr @a3, ptr %a\n" " ret void\n" "}\n" "define void @a3() {\n" "entry:\n" - " %a = alloca void ()*\n" - " store void ()* @a1, void ()** %a\n" + " %a = alloca ptr\n" + " store ptr @a1, ptr %a\n" " ret void\n" "}\n" "define void @b1() {\n" "entry:\n" - " %a = alloca void ()*\n" - " store void ()* @b2, void ()** %a\n" - " store void ()* @d3, void ()** %a\n" + " %a = alloca ptr\n" + " store ptr @b2, ptr %a\n" + " store ptr @d3, ptr %a\n" " ret void\n" "}\n" "define void @b2() {\n" "entry:\n" - " %a = alloca void ()*\n" - " store void ()* @b3, void ()** %a\n" + " %a = alloca ptr\n" + " store ptr @b3, ptr %a\n" " ret void\n" "}\n" "define void @b3() {\n" "entry:\n" - " %a = alloca void ()*\n" - " store void ()* @b1, void ()** %a\n" + " %a = alloca ptr\n" + " store ptr @b1, ptr %a\n" " ret void\n" "}\n" "define void @c1() {\n" "entry:\n" - " %a = alloca void ()*\n" - " store void ()* @c2, void ()** %a\n" - " store void ()* @d2, void ()** %a\n" + " %a = alloca ptr\n" + " store ptr @c2, ptr %a\n" + " store ptr @d2, ptr %a\n" " ret void\n" "}\n" "define void @c2() {\n" "entry:\n" - " %a = alloca void ()*\n" - " store void ()* @c3, void ()** %a\n" + " %a = alloca ptr\n" + " store ptr @c3, ptr %a\n" " ret void\n" "}\n" "define void @c3() {\n" "entry:\n" - " %a = alloca void ()*\n" - " store void ()* @c1, void ()** %a\n" + " %a = alloca ptr\n" + " store ptr @c1, ptr %a\n" " ret void\n" "}\n" "define void @d1() {\n" "entry:\n" - " %a = alloca void ()*\n" - " store void ()* @d2, void ()** %a\n" + " %a = alloca ptr\n" + " store ptr @d2, 
ptr %a\n" " ret void\n" "}\n" "define void @d2() {\n" "entry:\n" - " %a = alloca void ()*\n" - " store void ()* @d3, void ()** %a\n" + " %a = alloca ptr\n" + " store ptr @d3, ptr %a\n" " ret void\n" "}\n" "define void @d3() {\n" "entry:\n" - " %a = alloca void ()*\n" - " store void ()* @d1, void ()** %a\n" + " %a = alloca ptr\n" + " store ptr @d1, ptr %a\n" " ret void\n" "}\n"; @@ -1005,20 +1005,20 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertionLargeRefCycle) { std::unique_ptr M = parseAssembly(Context, "define void @a() {\n" "entry:\n" - " %p = alloca void ()*\n" - " store void ()* @b, void ()** %p\n" + " %p = alloca ptr\n" + " store ptr @b, ptr %p\n" " ret void\n" "}\n" "define void @b() {\n" "entry:\n" - " %p = alloca void ()*\n" - " store void ()* @c, void ()** %p\n" + " %p = alloca ptr\n" + " store ptr @c, ptr %p\n" " ret void\n" "}\n" "define void @c() {\n" "entry:\n" - " %p = alloca void ()*\n" - " store void ()* @d, void ()** %p\n" + " %p = alloca ptr\n" + " store ptr @d, ptr %p\n" " ret void\n" "}\n" "define void @d() {\n" @@ -1306,25 +1306,25 @@ TEST(LazyCallGraphTest, InternalEdgeRemoval) { LLVMContext Context; // A nice fully connected (including self-edges) RefSCC. std::unique_ptr M = parseAssembly( - Context, "define void @a(i8** %ptr) {\n" + Context, "define void @a(ptr %ptr) {\n" "entry:\n" - " store i8* bitcast (void(i8**)* @a to i8*), i8** %ptr\n" - " store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n" - " store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n" + " store ptr @a, ptr %ptr\n" + " store ptr @b, ptr %ptr\n" + " store ptr @c, ptr %ptr\n" " ret void\n" "}\n" - "define void @b(i8** %ptr) {\n" + "define void @b(ptr %ptr) {\n" "entry:\n" - " store i8* bitcast (void(i8**)* @a to i8*), i8** %ptr\n" - " store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n" - " store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n" + " store ptr @a, ptr %ptr\n" + " store ptr @b, ptr %ptr\n" + " store ptr @c, ptr %ptr\n" " ret void\n" "}\n" - "define void @c(i8** %ptr) {\n" + "define void @c(ptr %ptr) {\n" "entry:\n" - " store i8* bitcast (void(i8**)* @a to i8*), i8** %ptr\n" - " store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n" - " store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n" + " store ptr @a, ptr %ptr\n" + " store ptr @b, ptr %ptr\n" + " store ptr @c, ptr %ptr\n" " ret void\n" "}\n"); LazyCallGraph CG = buildCG(*M); @@ -1384,25 +1384,25 @@ TEST(LazyCallGraphTest, InternalMultiEdgeRemoval) { LLVMContext Context; // A nice fully connected (including self-edges) RefSCC. 
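The hunks in this file are mechanical typed-pointer to opaque-pointer conversions: once every pointer value has type ptr, the bitcast constant expressions wrapping @a, @b, and @c vanish, and a store of a function address spells directly as store ptr @a, ptr %ptr, leaving the graph's reference edges unchanged. A quick round-trip check of the migrated spelling (illustrative only, not part of the patch):

    LLVMContext Ctx;
    SMDiagnostic Err;
    std::unique_ptr<Module> M = parseAssemblyString(R"(
      define void @a(ptr %ptr) {
      entry:
        store ptr @a, ptr %ptr
        ret void
      }
    )", Err, Ctx);
    if (!M)
      Err.print("opaque-ptr-check", errs()); // report why parsing failed

The fully connected module the comment above announces is then parsed the same way: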
std::unique_ptr M = parseAssembly( - Context, "define void @a(i8** %ptr) {\n" + Context, "define void @a(ptr %ptr) {\n" "entry:\n" - " store i8* bitcast (void(i8**)* @a to i8*), i8** %ptr\n" - " store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n" - " store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n" + " store ptr @a, ptr %ptr\n" + " store ptr @b, ptr %ptr\n" + " store ptr @c, ptr %ptr\n" " ret void\n" "}\n" - "define void @b(i8** %ptr) {\n" + "define void @b(ptr %ptr) {\n" "entry:\n" - " store i8* bitcast (void(i8**)* @a to i8*), i8** %ptr\n" - " store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n" - " store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n" + " store ptr @a, ptr %ptr\n" + " store ptr @b, ptr %ptr\n" + " store ptr @c, ptr %ptr\n" " ret void\n" "}\n" - "define void @c(i8** %ptr) {\n" + "define void @c(ptr %ptr) {\n" "entry:\n" - " store i8* bitcast (void(i8**)* @a to i8*), i8** %ptr\n" - " store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n" - " store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n" + " store ptr @a, ptr %ptr\n" + " store ptr @b, ptr %ptr\n" + " store ptr @c, ptr %ptr\n" " ret void\n" "}\n"); LazyCallGraph CG = buildCG(*M); @@ -1454,22 +1454,22 @@ TEST(LazyCallGraphTest, InternalNoOpEdgeRemoval) { // Reference edges: a -> b -> c -> a // Call edges: a -> c -> b -> a std::unique_ptr M = parseAssembly( - Context, "define void @a(i8** %ptr) {\n" + Context, "define void @a(ptr %ptr) {\n" "entry:\n" - " call void @b(i8** %ptr)\n" - " store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n" + " call void @b(ptr %ptr)\n" + " store ptr @c, ptr %ptr\n" " ret void\n" "}\n" - "define void @b(i8** %ptr) {\n" + "define void @b(ptr %ptr) {\n" "entry:\n" - " store i8* bitcast (void(i8**)* @a to i8*), i8** %ptr\n" - " call void @c(i8** %ptr)\n" + " store ptr @a, ptr %ptr\n" + " call void @c(ptr %ptr)\n" " ret void\n" "}\n" - "define void @c(i8** %ptr) {\n" + "define void @c(ptr %ptr) {\n" "entry:\n" - " call void @a(i8** %ptr)\n" - " store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n" + " call void @a(ptr %ptr)\n" + " store ptr @b, ptr %ptr\n" " ret void\n" "}\n"); LazyCallGraph CG = buildCG(*M); @@ -1622,24 +1622,24 @@ TEST(LazyCallGraphTest, InternalRefEdgeToCall) { "entry:\n" " call void @b()\n" " call void @c()\n" - " store void()* @d, void()** undef\n" + " store ptr @d, ptr undef\n" " ret void\n" "}\n" "define void @b() {\n" "entry:\n" - " store void()* @c, void()** undef\n" + " store ptr @c, ptr undef\n" " call void @d()\n" " ret void\n" "}\n" "define void @c() {\n" "entry:\n" - " store void()* @b, void()** undef\n" + " store ptr @b, ptr undef\n" " call void @d()\n" " ret void\n" "}\n" "define void @d() {\n" "entry:\n" - " store void()* @a, void()** undef\n" + " store ptr @a, ptr undef\n" " ret void\n" "}\n"); LazyCallGraph CG = buildCG(*M); @@ -1745,13 +1745,13 @@ TEST(LazyCallGraphTest, InternalRefEdgeToCallNoCycleInterleaved) { "}\n" "define void @c3() {\n" "entry:\n" - " store void()* @b1, void()** undef\n" + " store ptr @b1, ptr undef\n" " call void @d()\n" " ret void\n" "}\n" "define void @d() {\n" "entry:\n" - " store void()* @a, void()** undef\n" + " store ptr @a, ptr undef\n" " ret void\n" "}\n"); LazyCallGraph CG = buildCG(*M); @@ -1875,13 +1875,13 @@ TEST(LazyCallGraphTest, InternalRefEdgeToCallBothPartitionAndMerge) { "}\n" "define void @f() {\n" "entry:\n" - " store void()* @b, void()** undef\n" + " store ptr @b, ptr undef\n" " call void @g()\n" " ret void\n" "}\n" "define void @g() {\n" "entry:\n" - " store void()* @a, void()** 
undef\n" + " store ptr @a, ptr undef\n" " ret void\n" "}\n"); LazyCallGraph CG = buildCG(*M); @@ -1962,9 +1962,9 @@ TEST(LazyCallGraphTest, HandleBlockAddress) { "bb:\n" " unreachable\n" "}\n" - "define void @g(i8** %ptr) {\n" + "define void @g(ptr %ptr) {\n" "entry:\n" - " store i8* blockaddress(@f, %bb), i8** %ptr\n" + " store ptr blockaddress(@f, %bb), ptr %ptr\n" " ret void\n" "}\n"); LazyCallGraph CG = buildCG(*M); @@ -1991,9 +1991,9 @@ TEST(LazyCallGraphTest, HandleBlockAddress2) { parseAssembly(Context, "define void @f() {\n" " ret void\n" "}\n" - "define void @g(i8** %ptr) {\n" + "define void @g(ptr %ptr) {\n" "bb:\n" - " store i8* blockaddress(@g, %bb), i8** %ptr\n" + " store ptr blockaddress(@g, %bb), ptr %ptr\n" " ret void\n" "}\n"); LazyCallGraph CG = buildCG(*M); @@ -2018,31 +2018,31 @@ TEST(LazyCallGraphTest, ReplaceNodeFunction) { // function. std::unique_ptr M = parseAssembly(Context, - "define void @a(i8** %ptr) {\n" + "define void @a(ptr %ptr) {\n" "entry:\n" - " store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n" + " store ptr @d, ptr %ptr\n" " ret void\n" "}\n" - "define void @b(i8** %ptr) {\n" + "define void @b(ptr %ptr) {\n" "entry:\n" - " store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n" - " store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n" - " call void @d(i8** %ptr)" + " store ptr @d, ptr %ptr\n" + " store ptr @d, ptr %ptr\n" + " call void @d(ptr %ptr)" " ret void\n" "}\n" - "define void @c(i8** %ptr) {\n" + "define void @c(ptr %ptr) {\n" "entry:\n" - " call void @d(i8** %ptr)" - " call void @d(i8** %ptr)" - " store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n" + " call void @d(ptr %ptr)" + " call void @d(ptr %ptr)" + " store ptr @d, ptr %ptr\n" " ret void\n" "}\n" - "define void @d(i8** %ptr) {\n" + "define void @d(ptr %ptr) {\n" "entry:\n" - " store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n" - " call void @c(i8** %ptr)" - " call void @d(i8** %ptr)" - " store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n" + " store ptr @b, ptr %ptr\n" + " call void @c(ptr %ptr)" + " call void @d(ptr %ptr)" + " store ptr @d, ptr %ptr\n" " ret void\n" "}\n"); LazyCallGraph CG = buildCG(*M); @@ -2098,25 +2098,25 @@ TEST(LazyCallGraphTest, RemoveFunctionWithSpuriousRef) { // A graph with a couple of RefSCCs. 
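Note what survives the migration in these LazyCallGraph tests: a direct call void @d(ptr %ptr) still produces a call edge, while store ptr @d (and the blockaddress stores above) still produce only reference edges. A hedged sketch of telling the two apart on a graph built as in these tests (CG and M as in the surrounding code; populate() materializes the edge list):

    LazyCallGraph::Node &N = CG.get(*M->getFunction("d"));
    for (LazyCallGraph::Edge &E : N.populate())
      errs() << (E.isCall() ? "call -> " : "ref  -> ")
             << E.getFunction().getName() << "\n";

The couple-of-RefSCCs module the comment above introduces is unchanged beyond the pointer spelling: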
std::unique_ptr M = parseAssembly(Context, - "define void @a(i8** %ptr) {\n" + "define void @a(ptr %ptr) {\n" "entry:\n" - " store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n" + " store ptr @d, ptr %ptr\n" " ret void\n" "}\n" - "define void @b(i8** %ptr) {\n" + "define void @b(ptr %ptr) {\n" "entry:\n" - " store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n" + " store ptr @c, ptr %ptr\n" " ret void\n" "}\n" - "define void @c(i8** %ptr) {\n" + "define void @c(ptr %ptr) {\n" "entry:\n" - " call void @d(i8** %ptr)" + " call void @d(ptr %ptr)" " ret void\n" "}\n" - "define void @d(i8** %ptr) {\n" + "define void @d(ptr %ptr) {\n" "entry:\n" - " call void @c(i8** %ptr)" - " store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n" + " call void @c(ptr %ptr)" + " store ptr @b, ptr %ptr\n" " ret void\n" "}\n" "define void @dead() {\n" @@ -2965,7 +2965,7 @@ TEST(LazyCallGraphTest, AddSplitFunctions5) { LLVMContext Context; std::unique_ptr M = parseAssembly(Context, "define void @f() {\n" - " %1 = bitcast void ()* @f2 to i8*\n" + " %1 = bitcast ptr @f2 to ptr\n" " ret void\n" "}\n" "define void @f2() {\n" diff --git a/llvm/unittests/Analysis/SparsePropagation.cpp b/llvm/unittests/Analysis/SparsePropagation.cpp index ca73a480cbb2d..0cbf5de81c808 100644 --- a/llvm/unittests/Analysis/SparsePropagation.cpp +++ b/llvm/unittests/Analysis/SparsePropagation.cpp @@ -357,9 +357,9 @@ TEST_F(SparsePropagationTest, GlobalVariableOverDefined) { /// Test that we propagate information through function returns. /// -/// define internal i64 @f(i1* %cond) { +/// define internal i64 @f(ptr %cond) { /// if: -/// %0 = load i1, i1* %cond +/// %0 = load i1, ptr %cond /// br i1 %0, label %then, label %else /// /// then: @@ -397,9 +397,9 @@ TEST_F(SparsePropagationTest, FunctionDefined) { /// Test that we propagate information through function returns. 
/// -/// define internal i64 @f(i1* %cond) { +/// define internal i64 @f(ptr %cond) { /// if: -/// %0 = load i1, i1* %cond +/// %0 = load i1, ptr %cond /// br i1 %0, label %then, label %else /// /// then: diff --git a/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp b/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp index d5ba1757ce35c..3c7ee7ad1334e 100644 --- a/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp +++ b/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp @@ -214,18 +214,18 @@ TEST(UnrollAnalyzerTest, PtrCmpSimplifications) { "target datalayout = \"e-m:o-i64:64-f80:128-n8:16:32:64-S128\"\n" "define void @ptr_cmp(i8 *%a) {\n" "entry:\n" - " %limit = getelementptr i8, i8* %a, i64 40\n" - " %start.iv2 = getelementptr i8, i8* %a, i64 7\n" + " %limit = getelementptr i8, ptr %a, i64 40\n" + " %start.iv2 = getelementptr i8, ptr %a, i64 7\n" " br label %loop.body\n" "loop.body:\n" - " %iv.0 = phi i8* [ %a, %entry ], [ %iv.1, %loop.body ]\n" - " %iv2.0 = phi i8* [ %start.iv2, %entry ], [ %iv2.1, %loop.body ]\n" - " %cmp = icmp eq i8* %iv2.0, %iv.0\n" - " %cmp2 = icmp slt i8* %iv2.0, %iv.0\n" - " %cmp3 = icmp ult i8* %iv2.0, %iv.0\n" - " %iv.1 = getelementptr inbounds i8, i8* %iv.0, i64 1\n" - " %iv2.1 = getelementptr inbounds i8, i8* %iv2.0, i64 1\n" - " %exitcond = icmp ne i8* %iv.1, %limit\n" + " %iv.0 = phi ptr [ %a, %entry ], [ %iv.1, %loop.body ]\n" + " %iv2.0 = phi ptr [ %start.iv2, %entry ], [ %iv2.1, %loop.body ]\n" + " %cmp = icmp eq ptr %iv2.0, %iv.0\n" + " %cmp2 = icmp slt ptr %iv2.0, %iv.0\n" + " %cmp3 = icmp ult ptr %iv2.0, %iv.0\n" + " %iv.1 = getelementptr inbounds i8, ptr %iv.0, i64 1\n" + " %iv2.1 = getelementptr inbounds i8, ptr %iv2.0, i64 1\n" + " %exitcond = icmp ne ptr %iv.1, %limit\n" " br i1 %exitcond, label %loop.body, label %loop.exit\n" "loop.exit:\n" " ret void\n" @@ -248,14 +248,14 @@ TEST(UnrollAnalyzerTest, PtrCmpSimplifications) { Instruction *Cmp2 = &*BBI++; Instruction *Cmp3 = &*BBI++; // Check simplification expected on the 5th iteration. - // Check that "%cmp = icmp eq i8* %iv2.0, %iv.0" is simplified to 0. + // Check that "%cmp = icmp eq ptr %iv2.0, %iv.0" is simplified to 0. auto I1 = SimplifiedValuesVector[5].find(Cmp1); EXPECT_TRUE(I1 != SimplifiedValuesVector[5].end()); EXPECT_EQ(cast((*I1).second)->getZExtValue(), 0U); - // Check that "%cmp2 = icmp slt i8* %iv2.0, %iv.0" does not simplify + // Check that "%cmp2 = icmp slt ptr %iv2.0, %iv.0" does not simplify auto I2 = SimplifiedValuesVector[5].find(Cmp2); EXPECT_TRUE(I2 == SimplifiedValuesVector[5].end()); - // Check that "%cmp3 = icmp ult i8* %iv2.0, %iv.0" is simplified to 0. + // Check that "%cmp3 = icmp ult ptr %iv2.0, %iv.0" is simplified to 0. 
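Why iteration 5 behaves this way is plain pointer arithmetic on a single inbounds object: %iv.0 starts at %a and %iv2.0 at %a + 7, and both step by one byte per iteration. A worked check for SimplifiedValuesVector[5]:

    // %iv.0 = %a + 5,  %iv2.0 = %a + 12   (since start.iv2 = %a + 7)
    // %cmp  : (a+12) == (a+5)  -> never true, folds to 0
    // %cmp3 : (a+12) <u (a+5)  -> false for inbounds offsets, folds to 0
    // %cmp2 : signed '<' on pointers is not folded by the analyzer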
auto I3 = SimplifiedValuesVector[5].find(Cmp3); EXPECT_TRUE(I3 != SimplifiedValuesVector[5].end()); EXPECT_EQ(cast((*I1).second)->getZExtValue(), 0U); @@ -271,8 +271,8 @@ TEST(UnrollAnalyzerTest, CastSimplifications) { "\n" "loop:\n" " %iv = phi i64 [ 0, %entry ], [ %inc, %loop ]\n" - " %array_const_idx = getelementptr inbounds [10 x i32], [10 x i32]* @known_constant, i64 0, i64 %iv\n" - " %const_array_element = load i32, i32* %array_const_idx, align 4\n" + " %array_const_idx = getelementptr inbounds [10 x i32], ptr @known_constant, i64 0, i64 %iv\n" + " %const_array_element = load i32, ptr %array_const_idx, align 4\n" " %se = sext i32 %const_array_element to i64\n" " %ze = zext i32 %const_array_element to i64\n" " %tr = trunc i32 %const_array_element to i8\n" diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp index bb0280ee69cfd..89a0faefc5988 100644 --- a/llvm/unittests/Analysis/ValueTrackingTest.cpp +++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp @@ -2697,9 +2697,9 @@ TEST_F(ComputeKnownBitsTest, ComputeKnownBitsGEPWithRange) { parseAssembly( "define void @test(ptr %p) {\n" " %A = load i64, ptr %p, !range !{i64 64, i64 65536}\n" - " %APtr = inttoptr i64 %A to float*" - " %APtrPlus512 = getelementptr float, float* %APtr, i32 128\n" - " %c = icmp ugt float* %APtrPlus512, inttoptr (i32 523 to float*)\n" + " %APtr = inttoptr i64 %A to ptr" + " %APtrPlus512 = getelementptr float, ptr %APtr, i32 128\n" + " %c = icmp ugt ptr %APtrPlus512, inttoptr (i32 523 to ptr)\n" " call void @llvm.assume(i1 %c)\n" " ret void\n" "}\n" @@ -2730,9 +2730,9 @@ TEST_F(ComputeKnownBitsTest, ComputeKnownBitsGEPWithRangeNoOverlap) { parseAssembly( "define void @test(ptr %p) {\n" " %A = load i64, ptr %p, !range !{i64 32, i64 64}\n" - " %APtr = inttoptr i64 %A to float*" - " %APtrPlus512 = getelementptr float, float* %APtr, i32 128\n" - " %c = icmp ugt float* %APtrPlus512, inttoptr (i32 523 to float*)\n" + " %APtr = inttoptr i64 %A to ptr" + " %APtrPlus512 = getelementptr float, ptr %APtr, i32 128\n" + " %c = icmp ugt ptr %APtrPlus512, inttoptr (i32 523 to ptr)\n" " call void @llvm.assume(i1 %c)\n" " ret void\n" "}\n" diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc index 0180ba0a6c163..783267aa05b59 100644 --- a/llvm/unittests/CodeGen/MFCommon.inc +++ b/llvm/unittests/CodeGen/MFCommon.inc @@ -76,8 +76,10 @@ public: }; class BogusTargetInstrInfo : public TargetInstrInfo { + BogusRegisterInfo RegInfo; + public: - BogusTargetInstrInfo() : TargetInstrInfo() {} + BogusTargetInstrInfo() : TargetInstrInfo(RegInfo) {} }; class BogusSubtarget : public TargetSubtargetInfo { diff --git a/llvm/unittests/MIR/MachineMetadata.cpp b/llvm/unittests/MIR/MachineMetadata.cpp index 587551246c4f4..8c3637704fc71 100644 --- a/llvm/unittests/MIR/MachineMetadata.cpp +++ b/llvm/unittests/MIR/MachineMetadata.cpp @@ -205,8 +205,8 @@ TEST_F(MachineMetadataTest, MMSlotTrackerAArch64) { StringRef MIRString = R"MIR( --- | - define i32 @test0(i32* %p) { - %r = load i32, i32* %p, align 4 + define i32 @test0(ptr %p) { + %r = load i32, ptr %p, align 4 ret i32 %r } ... @@ -354,8 +354,8 @@ TEST_F(MachineMetadataTest, MMSlotTrackerX64) { StringRef MIRString = R"MIR( --- | - define i32 @test0(i32* %p) { - %r = load i32, i32* %p, align 4 + define i32 @test0(ptr %p) { + %r = load i32, ptr %p, align 4 ret i32 %r } ... 
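One hunk in this stretch is not a pointer-spelling change: MFCommon.inc now hands a register-info object to the TargetInstrInfo base constructor. The shape deserves a second look because the member is constructed after the base class runs; by assumption here, that is fine only because the base merely binds the reference rather than reading through it during construction:

    class BogusTargetInstrInfo : public TargetInstrInfo {
      BogusRegisterInfo RegInfo; // initialized after the base class
    public:
      // OK: binding a reference to a not-yet-constructed member is legal,
      // provided the base constructor only stores it for later use.
      BogusTargetInstrInfo() : TargetInstrInfo(RegInfo) {}
    };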
@@ -446,8 +446,8 @@ TEST_F(MachineMetadataTest, MMSlotTrackerAMDGPU) { StringRef MIRString = R"MIR( --- | - define i32 @test0(i32* %p) { - %r = load i32, i32* %p, align 4 + define i32 @test0(ptr %p) { + %r = load i32, ptr %p, align 4 ret i32 %r } ... diff --git a/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp b/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp index d7f2908bb079b..c36ed93a20cef 100644 --- a/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp +++ b/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp @@ -75,7 +75,7 @@ TEST(WebAssemblyExceptionInfoTest, TEST0) { declare i32 @__gxx_wasm_personality_v0(...) - define void @test0() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) { + define void @test0() personality ptr @__gxx_wasm_personality_v0 { unreachable } @@ -237,7 +237,7 @@ TEST(WebAssemblyExceptionInfoTest, TEST1) { declare i32 @__gxx_wasm_personality_v0(...) - define void @test1() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) { + define void @test1() personality ptr @__gxx_wasm_personality_v0 { unreachable } diff --git a/llvm/unittests/Transforms/IPO/AttributorTest.cpp b/llvm/unittests/Transforms/IPO/AttributorTest.cpp index e442dae9aa3ad..e345c60f781d2 100644 --- a/llvm/unittests/Transforms/IPO/AttributorTest.cpp +++ b/llvm/unittests/Transforms/IPO/AttributorTest.cpp @@ -78,17 +78,17 @@ TEST_F(AttributorTestBase, AAReachabilityTest) { const char *ModuleString = R"( @x = external global i32 define void @func4() { - store i32 0, i32* @x + store i32 0, ptr @x ret void } define internal void @func3() { - store i32 0, i32* @x + store i32 0, ptr @x ret void } define internal void @func8() { - store i32 0, i32* @x + store i32 0, ptr @x ret void } @@ -105,7 +105,7 @@ TEST_F(AttributorTestBase, AAReachabilityTest) { } declare void @unknown() - define internal void @func5(void ()* %ptr) { + define internal void @func5(ptr %ptr) { entry: call void %ptr() call void @unknown() @@ -114,8 +114,8 @@ TEST_F(AttributorTestBase, AAReachabilityTest) { define void @func6() { entry: - store i32 0, i32* @x - call void @func5(void ()* @func3) + store i32 0, ptr @x + call void @func5(ptr @func3) ret void } diff --git a/llvm/unittests/Transforms/Scalar/LICMTest.cpp b/llvm/unittests/Transforms/Scalar/LICMTest.cpp index 98a69bbb47de1..a193993ba04d6 100644 --- a/llvm/unittests/Transforms/Scalar/LICMTest.cpp +++ b/llvm/unittests/Transforms/Scalar/LICMTest.cpp @@ -37,13 +37,13 @@ TEST(LICMTest, TestSCEVInvalidationOnHoisting) { SMDiagnostic Error; StringRef Text = R"( - define void @foo(i64* %ptr) { + define void @foo(ptr %ptr) { entry: br label %loop loop: %iv = phi i64 [ 0, %entry ], [ %iv.inc, %loop ] - %n = load i64, i64* %ptr, !invariant.load !0 + %n = load i64, ptr %ptr, !invariant.load !0 %iv.inc = add i64 %iv, 1 %cmp = icmp ult i64 %iv.inc, %n br i1 %cmp, label %loop, label %exit @@ -62,17 +62,17 @@ TEST(LICMTest, TestSCEVInvalidationOnHoisting) { BasicBlock &EntryBB = F->getEntryBlock(); BasicBlock *LoopBB = EntryBB.getUniqueSuccessor(); - // Select `load i64, i64* %ptr`. + // Select `load i64, ptr %ptr`. Instruction *IBefore = &*LoopBB->getFirstNonPHIIt(); // Make sure the right instruction was selected. ASSERT_TRUE(isa(IBefore)); - // Upon this query SCEV caches disposition of SCEV. + // Upon this query SCEV caches disposition of SCEV. 
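// Concretely, the cached fact is the block disposition queried just
// below; a transform that hoists the load is responsible for dropping
// that cache. A minimal sketch of the invalidation step (assuming SE and
// the moved instruction are in scope, as in this test):
//   SE.forgetValue(IBefore); // drop the cached SCEV and dependent facts
// Without such a call, the post-run query would keep returning the stale
// DominatesBlock answer.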
ASSERT_EQ(SE.getBlockDisposition(SE.getSCEV(IBefore), LoopBB), ScalarEvolution::BlockDisposition::DominatesBlock); MPM.run(*M, MAM); - // Select `load i64, i64* %ptr` after it was hoisted. + // Select `load i64, ptr %ptr` after it was hoisted. Instruction *IAfter = &*EntryBB.getFirstNonPHIIt(); // Make sure the right instruction was selected. ASSERT_TRUE(isa<LoadInst>(IAfter)); @@ -84,7 +84,7 @@ TEST(LICMTest, TestSCEVInvalidationOnHoisting) { SE.getBlockDisposition(SE.getSCEV(IAfter), LoopBB); // If LICM has properly invalidated SCEV, - // 1. SCEV of should properly dominate the "loop" BB, + // 1. the SCEV of the hoisted load should properly dominate the "loop" BB, // 2. extra invalidation shouldn't change result of the query. EXPECT_EQ(DispositionBeforeInvalidation, ScalarEvolution::BlockDisposition::ProperlyDominatesBlock); diff --git a/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp b/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp index cb3d1001e4110..88eaa875a803a 100644 --- a/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp +++ b/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp @@ -265,21 +265,21 @@ class LoopPassManagerTest : public ::testing::Test { public: LoopPassManagerTest() : M(parseIR(Context, - "define void @f(i1* %ptr) {\n" + "define void @f(ptr %ptr) {\n" "entry:\n" " br label %loop.0\n" "loop.0:\n" - " %cond.0 = load volatile i1, i1* %ptr\n" + " %cond.0 = load volatile i1, ptr %ptr\n" " br i1 %cond.0, label %loop.0.0.ph, label %end\n" "loop.0.0.ph:\n" " br label %loop.0.0\n" "loop.0.0:\n" - " %cond.0.0 = load volatile i1, i1* %ptr\n" + " %cond.0.0 = load volatile i1, ptr %ptr\n" " br i1 %cond.0.0, label %loop.0.0, label %loop.0.1.ph\n" "loop.0.1.ph:\n" " br label %loop.0.1\n" "loop.0.1:\n" - " %cond.0.1 = load volatile i1, i1* %ptr\n" + " %cond.0.1 = load volatile i1, ptr %ptr\n" " br i1 %cond.0.1, label %loop.0.1, label %loop.0.latch\n" "loop.0.latch:\n" " br label %loop.0\n" @@ -287,11 +287,11 @@ class LoopPassManagerTest : public ::testing::Test { " ret void\n" "}\n" "\n" - "define void @g(i1* %ptr) {\n" + "define void @g(ptr %ptr) {\n" "entry:\n" " br label %loop.g.0\n" "loop.g.0:\n" - " %cond.0 = load volatile i1, i1* %ptr\n" + " %cond.0 = load volatile i1, ptr %ptr\n" " br i1 %cond.0, label %loop.g.0, label %end\n" "end:\n" " ret void\n" @@ -861,26 +861,26 @@ TEST_F(LoopPassManagerTest, IndirectOuterPassInvalidation) { TEST_F(LoopPassManagerTest, LoopChildInsertion) { // Super boring module with three loops in a single loop nest. 
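// For orientation, the nest built below is seen by LoopInfo like this
// (sketch, assuming a computed LoopInfo LI for @f):
for (Loop *Outer : LI)                   // visits loop.0
  for (Loop *Sub : Outer->getSubLoops()) // loop.0.0, loop.0.1, loop.0.2
    (void)Sub->getHeader()->getName();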
- M = parseIR(Context, "define void @f(i1* %ptr) {\n" + M = parseIR(Context, "define void @f(ptr %ptr) {\n" "entry:\n" " br label %loop.0\n" "loop.0:\n" - " %cond.0 = load volatile i1, i1* %ptr\n" + " %cond.0 = load volatile i1, ptr %ptr\n" " br i1 %cond.0, label %loop.0.0.ph, label %end\n" "loop.0.0.ph:\n" " br label %loop.0.0\n" "loop.0.0:\n" - " %cond.0.0 = load volatile i1, i1* %ptr\n" + " %cond.0.0 = load volatile i1, ptr %ptr\n" " br i1 %cond.0.0, label %loop.0.0, label %loop.0.1.ph\n" "loop.0.1.ph:\n" " br label %loop.0.1\n" "loop.0.1:\n" - " %cond.0.1 = load volatile i1, i1* %ptr\n" + " %cond.0.1 = load volatile i1, ptr %ptr\n" " br i1 %cond.0.1, label %loop.0.1, label %loop.0.2.ph\n" "loop.0.2.ph:\n" " br label %loop.0.2\n" "loop.0.2:\n" - " %cond.0.2 = load volatile i1, i1* %ptr\n" + " %cond.0.2 = load volatile i1, ptr %ptr\n" " br i1 %cond.0.2, label %loop.0.2, label %loop.0.latch\n" "loop.0.latch:\n" " br label %loop.0\n" @@ -1064,28 +1064,28 @@ TEST_F(LoopPassManagerTest, LoopChildInsertion) { TEST_F(LoopPassManagerTest, LoopPeerInsertion) { // Super boring module with two loop nests and loop nest with two child // loops. - M = parseIR(Context, "define void @f(i1* %ptr) {\n" + M = parseIR(Context, "define void @f(ptr %ptr) {\n" "entry:\n" " br label %loop.0\n" "loop.0:\n" - " %cond.0 = load volatile i1, i1* %ptr\n" + " %cond.0 = load volatile i1, ptr %ptr\n" " br i1 %cond.0, label %loop.0.0.ph, label %loop.2.ph\n" "loop.0.0.ph:\n" " br label %loop.0.0\n" "loop.0.0:\n" - " %cond.0.0 = load volatile i1, i1* %ptr\n" + " %cond.0.0 = load volatile i1, ptr %ptr\n" " br i1 %cond.0.0, label %loop.0.0, label %loop.0.2.ph\n" "loop.0.2.ph:\n" " br label %loop.0.2\n" "loop.0.2:\n" - " %cond.0.2 = load volatile i1, i1* %ptr\n" + " %cond.0.2 = load volatile i1, ptr %ptr\n" " br i1 %cond.0.2, label %loop.0.2, label %loop.0.latch\n" "loop.0.latch:\n" " br label %loop.0\n" "loop.2.ph:\n" " br label %loop.2\n" "loop.2:\n" - " %cond.2 = load volatile i1, i1* %ptr\n" + " %cond.2 = load volatile i1, ptr %ptr\n" " br i1 %cond.2, label %loop.2, label %end\n" "end:\n" " ret void\n" @@ -1318,31 +1318,31 @@ TEST_F(LoopPassManagerTest, LoopDeletion) { // Build a module with a single loop nest that contains one outer loop with // three subloops, and one of those with its own subloop. We will // incrementally delete all of these to test different deletion scenarios. 
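// A loop pass that erases a loop must also tell the driver, or the loop
// pass manager would keep visiting a dangling loop. Sketch of the
// protocol the deletion scenarios below exercise (assuming Loop &L and
// LPMUpdater &Updater, the standard loop-pass parameters):
std::string Name = std::string(L.getName());
// ... unlink and erase L's blocks ...
Updater.markLoopAsDeleted(L, Name);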
- M = parseIR(Context, "define void @f(i1* %ptr) {\n" + M = parseIR(Context, "define void @f(ptr %ptr) {\n" "entry:\n" " br label %loop.0\n" "loop.0:\n" - " %cond.0 = load volatile i1, i1* %ptr\n" + " %cond.0 = load volatile i1, ptr %ptr\n" " br i1 %cond.0, label %loop.0.0.ph, label %end\n" "loop.0.0.ph:\n" " br label %loop.0.0\n" "loop.0.0:\n" - " %cond.0.0 = load volatile i1, i1* %ptr\n" + " %cond.0.0 = load volatile i1, ptr %ptr\n" " br i1 %cond.0.0, label %loop.0.0, label %loop.0.1.ph\n" "loop.0.1.ph:\n" " br label %loop.0.1\n" "loop.0.1:\n" - " %cond.0.1 = load volatile i1, i1* %ptr\n" + " %cond.0.1 = load volatile i1, ptr %ptr\n" " br i1 %cond.0.1, label %loop.0.1, label %loop.0.2.ph\n" "loop.0.2.ph:\n" " br label %loop.0.2\n" "loop.0.2:\n" - " %cond.0.2 = load volatile i1, i1* %ptr\n" + " %cond.0.2 = load volatile i1, ptr %ptr\n" " br i1 %cond.0.2, label %loop.0.2.0.ph, label %loop.0.latch\n" "loop.0.2.0.ph:\n" " br label %loop.0.2.0\n" "loop.0.2.0:\n" - " %cond.0.2.0 = load volatile i1, i1* %ptr\n" + " %cond.0.2.0 = load volatile i1, ptr %ptr\n" " br i1 %cond.0.2.0, label %loop.0.2.0, label %loop.0.2.latch\n" "loop.0.2.latch:\n" " br label %loop.0.2\n" diff --git a/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp b/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp index 4235c93f275f0..00d9e9ff81e05 100644 --- a/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp +++ b/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp @@ -484,9 +484,9 @@ while.body: TEST(BasicBlockUtils, SplitIndirectBrCriticalEdgesIgnorePHIs) { LLVMContext C; std::unique_ptr M = parseIR(C, R"IR( -define void @crit_edge(i8* %tgt, i1 %cond0, i1 %cond1) { +define void @crit_edge(ptr %tgt, i1 %cond0, i1 %cond1) { entry: - indirectbr i8* %tgt, [label %bb0, label %bb1, label %bb2] + indirectbr ptr %tgt, [label %bb0, label %bb1, label %bb2] bb0: br i1 %cond0, label %bb1, label %bb2 bb1: @@ -526,9 +526,9 @@ define void @crit_edge(i8* %tgt, i1 %cond0, i1 %cond1) { TEST(BasicBlockUtils, SplitIndirectBrCriticalEdges) { LLVMContext C; std::unique_ptr M = parseIR(C, R"IR( -define void @crit_edge(i8* %tgt, i1 %cond0, i1 %cond1) { +define void @crit_edge(ptr %tgt, i1 %cond0, i1 %cond1) { entry: - indirectbr i8* %tgt, [label %bb0, label %bb1, label %bb2] + indirectbr ptr %tgt, [label %bb0, label %bb1, label %bb2] bb0: br i1 %cond0, label %bb1, label %bb2 bb1: diff --git a/llvm/unittests/Transforms/Utils/CloningTest.cpp b/llvm/unittests/Transforms/Utils/CloningTest.cpp index d990808d31fe2..237bc6e873f94 100644 --- a/llvm/unittests/Transforms/Utils/CloningTest.cpp +++ b/llvm/unittests/Transforms/Utils/CloningTest.cpp @@ -394,7 +394,7 @@ TEST(CloneLoop, CloneLoopNest) { std::unique_ptr M = parseIR( Context, - R"(define void @foo(i32* %A, i32 %ub) { + R"(define void @foo(ptr %A, i32 %ub) { entry: %guardcmp = icmp slt i32 0, %ub br i1 %guardcmp, label %for.outer.preheader, label %for.end @@ -408,8 +408,8 @@ for.inner.preheader: for.inner: %i = phi i32 [ 0, %for.inner.preheader ], [ %inc, %for.inner ] %idxprom = sext i32 %i to i64 - %arrayidx = getelementptr inbounds i32, i32* %A, i64 %idxprom - store i32 %i, i32* %arrayidx, align 4 + %arrayidx = getelementptr inbounds i32, ptr %A, i64 %idxprom + store i32 %i, ptr %arrayidx, align 4 %inc = add nsw i32 %i, 1 %cmp = icmp slt i32 %inc, %ub br i1 %cmp, label %for.inner, label %for.inner.exit @@ -728,10 +728,10 @@ TEST(CloneFunction, CloneEmptyFunction) { TEST(CloneFunction, CloneFunctionWithInalloca) { StringRef ImplAssembly = R"( - declare void @a(i32* inalloca(i32)) 
+ declare void @a(ptr inalloca(i32)) define void @foo() { %a = alloca inalloca i32 - call void @a(i32* inalloca(i32) %a) + call void @a(ptr inalloca(i32) %a) ret void } declare void @bar() diff --git a/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp b/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp index 9ea8de3da1e5b..90f06204ec9b3 100644 --- a/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp +++ b/llvm/unittests/Transforms/Utils/CodeExtractorTest.cpp @@ -154,13 +154,13 @@ TEST(CodeExtractor, ExitBlockOrderingPhis) { %0 = alloca i32, align 4 br label %test0 test0: - %c = load i32, i32* %0, align 4 + %c = load i32, ptr %0, align 4 br label %test1 test1: - %e = load i32, i32* %0, align 4 + %e = load i32, ptr %0, align 4 br i1 true, label %first, label %test test: - %d = load i32, i32* %0, align 4 + %d = load i32, ptr %0, align 4 br i1 true, label %first, label %next first: %1 = phi i32 [ %c, %test ], [ %e, %test1 ] @@ -212,13 +212,13 @@ TEST(CodeExtractor, ExitBlockOrdering) { %0 = alloca i32, align 4 br label %test0 test0: - %c = load i32, i32* %0, align 4 + %c = load i32, ptr %0, align 4 br label %test1 test1: - %e = load i32, i32* %0, align 4 + %e = load i32, ptr %0, align 4 br i1 true, label %first, label %test test: - %d = load i32, i32* %0, align 4 + %d = load i32, ptr %0, align 4 br i1 true, label %first, label %next first: ret void @@ -317,7 +317,7 @@ TEST(CodeExtractor, StoreOutputInvokeResultAfterEHPad) { std::unique_ptr M(parseAssemblyString(R"invalid( declare i8 @hoge() - define i32 @foo() personality i8* null { + define i32 @foo() personality ptr null { entry: %call = invoke i8 @hoge() to label %invoke.cont unwind label %lpad @@ -326,8 +326,8 @@ TEST(CodeExtractor, StoreOutputInvokeResultAfterEHPad) { unreachable lpad: ; preds = %entry - %0 = landingpad { i8*, i32 } - catch i8* null + %0 = landingpad { ptr, i32 } + catch ptr null br i1 undef, label %catch, label %finally.catchall catch: ; preds = %lpad @@ -342,13 +342,13 @@ TEST(CodeExtractor, StoreOutputInvokeResultAfterEHPad) { unreachable lpad2: ; preds = %invoke.cont2, %catch - %ex.1 = phi i8* [ undef, %invoke.cont2 ], [ null, %catch ] - %1 = landingpad { i8*, i32 } - catch i8* null + %ex.1 = phi ptr [ undef, %invoke.cont2 ], [ null, %catch ] + %1 = landingpad { ptr, i32 } + catch ptr null br label %finally.catchall finally.catchall: ; preds = %lpad33, %lpad - %ex.2 = phi i8* [ %ex.1, %lpad2 ], [ null, %lpad ] + %ex.2 = phi ptr [ %ex.1, %lpad2 ], [ null, %lpad ] unreachable } )invalid", Err, Ctx)); @@ -384,7 +384,7 @@ TEST(CodeExtractor, StoreOutputInvokeResultInExitStub) { std::unique_ptr M(parseAssemblyString(R"invalid( declare i32 @bar() - define i32 @foo() personality i8* null { + define i32 @foo() personality ptr null { entry: %0 = invoke i32 @bar() to label %exit unwind label %lpad @@ -392,9 +392,9 @@ TEST(CodeExtractor, StoreOutputInvokeResultInExitStub) { ret i32 %0 lpad: - %1 = landingpad { i8*, i32 } + %1 = landingpad { ptr, i32 } cleanup - resume { i8*, i32 } %1 + resume { ptr, i32 } %1 } )invalid", Err, Ctx)); @@ -421,7 +421,7 @@ TEST(CodeExtractor, ExtractAndInvalidateAssumptionCache) { target triple = "aarch64" %b = type { i64 } - declare void @g(i8*) + declare void @g(ptr) declare void @llvm.assume(i1) #0 @@ -430,9 +430,9 @@ TEST(CodeExtractor, ExtractAndInvalidateAssumptionCache) { br label %label label: - %0 = load %b*, %b** inttoptr (i64 8 to %b**), align 8 - %1 = getelementptr inbounds %b, %b* %0, i64 undef, i32 0 - %2 = load i64, i64* %1, align 8 + %0 = load ptr, ptr inttoptr (i64 8 to 
ptr), align 8 + %1 = getelementptr inbounds %b, ptr %0, i64 undef, i32 0 + %2 = load i64, ptr %1, align 8 %3 = icmp ugt i64 %2, 1 br i1 %3, label %if.then, label %if.else @@ -440,8 +440,8 @@ TEST(CodeExtractor, ExtractAndInvalidateAssumptionCache) { unreachable if.else: - call void @g(i8* undef) - store i64 undef, i64* null, align 536870912 + call void @g(ptr undef) + store i64 undef, ptr null, align 536870912 %4 = icmp eq i64 %2, 0 call void @llvm.assume(i1 %4) unreachable @@ -473,9 +473,9 @@ TEST(CodeExtractor, RemoveBitcastUsesFromOuterLifetimeMarkers) { target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" - declare void @use(i32*) - declare void @llvm.lifetime.start.p0i8(i64, i8*) - declare void @llvm.lifetime.end.p0i8(i64, i8*) + declare void @use(ptr) + declare void @llvm.lifetime.start.p0i8(i64, ptr) + declare void @llvm.lifetime.end.p0i8(i64, ptr) define void @foo() { entry: @@ -483,14 +483,14 @@ TEST(CodeExtractor, RemoveBitcastUsesFromOuterLifetimeMarkers) { br label %extract extract: - %1 = bitcast i32* %0 to i8* - call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) - call void @use(i32* %0) + %1 = bitcast ptr %0 to ptr + call void @llvm.lifetime.start.p0i8(i64 4, ptr %1) + call void @use(ptr %0) br label %exit exit: - call void @use(i32* %0) - call void @llvm.lifetime.end.p0i8(i64 4, i8* %1) + call void @use(ptr %0) + call void @llvm.lifetime.end.p0i8(i64 4, ptr %1) ret void } )ir", diff --git a/llvm/unittests/Transforms/Utils/CodeMoverUtilsTest.cpp b/llvm/unittests/Transforms/Utils/CodeMoverUtilsTest.cpp index 9466977d00649..191ccc3a9dbd9 100644 --- a/llvm/unittests/Transforms/Utils/CodeMoverUtilsTest.cpp +++ b/llvm/unittests/Transforms/Utils/CodeMoverUtilsTest.cpp @@ -75,21 +75,21 @@ TEST(CodeMoverUtils, IsControlFlowEquivalentSimpleTest) { // i = 3; // } std::unique_ptr M = - parseIR(C, R"(define void @foo(i32* %i, i1 %cond1, i1 %cond2) { + parseIR(C, R"(define void @foo(ptr %i, i1 %cond1, i1 %cond2) { entry: br i1 %cond1, label %if.first, label %if.first.end if.first: - store i32 1, i32* %i, align 4 + store i32 1, ptr %i, align 4 br label %if.first.end if.first.end: br i1 %cond1, label %if.second, label %if.second.end if.second: - store i32 2, i32* %i, align 4 + store i32 2, ptr %i, align 4 br label %if.second.end if.second.end: br i1 %cond2, label %if.third, label %if.third.end if.third: - store i32 3, i32* %i, align 4 + store i32 3, ptr %i, align 4 br label %if.third.end if.third.end: ret void @@ -136,51 +136,51 @@ TEST(CodeMoverUtils, IsControlFlowEquivalentOppositeCondTest) { // i = 9; // } std::unique_ptr M = - parseIR(C, R"(define void @foo(i32* %i, i32 %X, i32 %Y) { + parseIR(C, R"(define void @foo(ptr %i, i32 %X, i32 %Y) { entry: %cmp1 = icmp ult i32 %X, %Y br i1 %cmp1, label %if.first, label %if.first.end if.first: - store i32 1, i32* %i, align 4 + store i32 1, ptr %i, align 4 br label %if.first.end if.first.end: %cmp2 = icmp ugt i32 %Y, %X br i1 %cmp2, label %if.second, label %if.second.end if.second: - store i32 2, i32* %i, align 4 + store i32 2, ptr %i, align 4 br label %if.second.end if.second.end: %cmp3 = icmp uge i32 %X, %Y br i1 %cmp3, label %if.third, label %if.third.else if.third: - store i32 3, i32* %i, align 4 + store i32 3, ptr %i, align 4 br label %if.third.end if.third.else: - store i32 4, i32* %i, align 4 + store i32 4, ptr %i, align 4 br label %if.third.end if.third.end: %cmp4 = icmp eq i32 %X, %Y br i1 %cmp4, label %if.fourth, label %if.fourth.end if.fourth: - store 
i32 5, i32* %i, align 4 + store i32 5, ptr %i, align 4 br label %if.fourth.end if.fourth.end: %cmp5 = icmp eq i32 %Y, %X br i1 %cmp5, label %if.fifth, label %if.fifth.else if.fifth: - store i32 6, i32* %i, align 4 + store i32 6, ptr %i, align 4 br label %if.fifth.end if.fifth.else: - store i32 7, i32* %i, align 4 + store i32 7, ptr %i, align 4 br label %if.fifth.end if.fifth.end: %cmp6 = icmp ne i32 %X, %Y br i1 %cmp6, label %if.sixth, label %if.sixth.else if.sixth: - store i32 8, i32* %i, align 4 + store i32 8, ptr %i, align 4 br label %if.sixth.end if.sixth.else: - store i32 9, i32* %i, align 4 + store i32 9, ptr %i, align 4 br label %if.sixth.end if.sixth.end: ret void @@ -227,20 +227,20 @@ TEST(CodeMoverUtils, IsControlFlowEquivalentCondNestTest) { // i = 2; // } std::unique_ptr M = - parseIR(C, R"(define void @foo(i32* %i, i1 %cond1, i1 %cond2) { + parseIR(C, R"(define void @foo(ptr %i, i1 %cond1, i1 %cond2) { entry: br i1 %cond1, label %if.outer.first, label %if.first.end if.outer.first: br i1 %cond2, label %if.inner.first, label %if.first.end if.inner.first: - store i32 1, i32* %i, align 4 + store i32 1, ptr %i, align 4 br label %if.first.end if.first.end: br i1 %cond2, label %if.outer.second, label %if.second.end if.outer.second: br i1 %cond1, label %if.inner.second, label %if.second.end if.inner.second: - store i32 2, i32* %i, align 4 + store i32 2, ptr %i, align 4 br label %if.second.end if.second.end: ret void @@ -283,7 +283,7 @@ TEST(CodeMoverUtils, IsControlFlowEquivalentImbalanceTest) { // i = 4; // } std::unique_ptr M = - parseIR(C, R"(define void @foo(i32* %i, i1 %cond1, i1 %cond2, i1 %cond3) { + parseIR(C, R"(define void @foo(ptr %i, i1 %cond1, i1 %cond2, i1 %cond3) { entry: br i1 %cond1, label %if.outer.first, label %if.first.end if.outer.first: @@ -291,26 +291,26 @@ TEST(CodeMoverUtils, IsControlFlowEquivalentImbalanceTest) { if.middle.first: br i1 %cond3, label %if.inner.first, label %if.first.end if.inner.first: - store i32 1, i32* %i, align 4 + store i32 1, ptr %i, align 4 br label %if.first.end if.first.end: br i1 %cond2, label %if.outer.second, label %if.second.end if.outer.second: br i1 %cond3, label %if.inner.second, label %if.second.end if.inner.second: - store i32 2, i32* %i, align 4 + store i32 2, ptr %i, align 4 br label %if.second.end if.second.end: br i1 %cond1, label %if.outer.third, label %if.third.end if.outer.third: br i1 %cond1, label %if.inner.third, label %if.third.end if.inner.third: - store i32 3, i32* %i, align 4 + store i32 3, ptr %i, align 4 br label %if.third.end if.third.end: br i1 %cond1, label %if.fourth, label %if.fourth.end if.fourth: - store i32 4, i32* %i, align 4 + store i32 4, ptr %i, align 4 br label %if.fourth.end if.fourth.end: ret void @@ -343,28 +343,28 @@ TEST(CodeMoverUtils, IsControlFlowEquivalentPointerTest) { // i = 3; // } std::unique_ptr M = - parseIR(C, R"(define void @foo(i32* %i, i32* %cond) { + parseIR(C, R"(define void @foo(ptr %i, ptr %cond) { entry: - %0 = load i32, i32* %cond, align 4 + %0 = load i32, ptr %cond, align 4 %tobool1 = icmp ne i32 %0, 0 br i1 %tobool1, label %if.first, label %if.first.end if.first: - store i32 1, i32* %i, align 4 + store i32 1, ptr %i, align 4 br label %if.first.end if.first.end: - %1 = load i32, i32* %cond, align 4 + %1 = load i32, ptr %cond, align 4 %tobool2 = icmp ne i32 %1, 0 br i1 %tobool2, label %if.second, label %if.second.end if.second: - store i32 2, i32* %i, align 4 + store i32 2, ptr %i, align 4 br label %if.second.end if.second.end: - store i32 1, i32* %cond, align 4 - %2 = 
load i32, i32* %cond, align 4 + store i32 1, ptr %cond, align 4 + %2 = load i32, ptr %cond, align 4 %tobool3 = icmp ne i32 %2, 0 br i1 %tobool3, label %if.third, label %if.third.end if.third: - store i32 3, i32* %i, align 4 + store i32 3, ptr %i, align 4 br label %if.third.end if.third.end: ret void @@ -450,7 +450,7 @@ TEST(CodeMoverUtils, IsSafeToMoveTest1) { // } // } std::unique_ptr M = parseIR( - C, R"(define void @foo(i32* noalias %A, i32* noalias %B, i32* noalias %C + C, R"(define void @foo(ptr noalias %A, ptr noalias %B, ptr noalias %C , i64 %N) { entry: %X = sdiv i64 1, %N @@ -461,18 +461,18 @@ TEST(CodeMoverUtils, IsSafeToMoveTest1) { br i1 %cmp1, label %for.body, label %for.end for.body: %i = phi i64 [ 0, %entry ], [ %inc, %for.body ] - %arrayidx_A5 = getelementptr inbounds i32, i32* %A, i64 5 - store i32 5, i32* %arrayidx_A5, align 4 - %arrayidx_A = getelementptr inbounds i32, i32* %A, i64 %i - store i32 0, i32* %arrayidx_A, align 4 - %load1 = load i32, i32* %arrayidx_A, align 4 - %arrayidx_B = getelementptr inbounds i32, i32* %B, i64 %i - store i32 %load1, i32* %arrayidx_B, align 4 - %load2 = load i32, i32* %arrayidx_A, align 4 - %arrayidx_C = getelementptr inbounds i32, i32* %C, i64 %i - store i32 %load2, i32* %arrayidx_C, align 4 - %arrayidx_A6 = getelementptr inbounds i32, i32* %A, i64 6 - store i32 6, i32* %arrayidx_A6, align 4 + %arrayidx_A5 = getelementptr inbounds i32, ptr %A, i64 5 + store i32 5, ptr %arrayidx_A5, align 4 + %arrayidx_A = getelementptr inbounds i32, ptr %A, i64 %i + store i32 0, ptr %arrayidx_A, align 4 + %load1 = load i32, ptr %arrayidx_A, align 4 + %arrayidx_B = getelementptr inbounds i32, ptr %B, i64 %i + store i32 %load1, ptr %arrayidx_B, align 4 + %load2 = load i32, ptr %arrayidx_A, align 4 + %arrayidx_C = getelementptr inbounds i32, ptr %C, i64 %i + store i32 %load2, ptr %arrayidx_C, align 4 + %arrayidx_A6 = getelementptr inbounds i32, ptr %A, i64 6 + store i32 6, ptr %arrayidx_A6, align 4 %inc = add nsw i64 %i, 1 %cmp = icmp slt i64 %inc, %N br i1 %cmp, label %for.body, label %for.end @@ -686,19 +686,19 @@ TEST(CodeMoverUtils, IsSafeToMoveTest5) { LLVMContext C; std::unique_ptr M = - parseIR(C, R"(define void @dependence(i32* noalias %A, i32* noalias %B){ + parseIR(C, R"(define void @dependence(ptr noalias %A, ptr noalias %B){ entry: - store i32 0, i32* %A, align 4 ; storeA0 - store i32 2, i32* %A, align 4 ; storeA1 - %tmp0 = load i32, i32* %A, align 4 ; loadA0 - store i32 1, i32* %B, align 4 ; storeB0 - %tmp1 = load i32, i32* %A, align 4 ; loadA1 - store i32 2, i32* %A, align 4 ; storeA2 - store i32 4, i32* %B, align 4 ; StoreB1 - %tmp2 = load i32, i32* %A, align 4 ; loadA2 - %tmp3 = load i32, i32* %A, align 4 ; loadA3 - %tmp4 = load i32, i32* %B, align 4 ; loadB2 - %tmp5 = load i32, i32* %B, align 4 ; loadB3 + store i32 0, ptr %A, align 4 ; storeA0 + store i32 2, ptr %A, align 4 ; storeA1 + %tmp0 = load i32, ptr %A, align 4 ; loadA0 + store i32 1, ptr %B, align 4 ; storeB0 + %tmp1 = load i32, ptr %A, align 4 ; loadA1 + store i32 2, ptr %A, align 4 ; storeA2 + store i32 4, ptr %B, align 4 ; StoreB1 + %tmp2 = load i32, ptr %A, align 4 ; loadA2 + %tmp3 = load i32, ptr %A, align 4 ; loadA3 + %tmp4 = load i32, ptr %B, align 4 ; loadB2 + %tmp5 = load i32, ptr %B, align 4 ; loadB3 ret void })"); @@ -763,63 +763,63 @@ TEST(CodeMoverUtils, IsSafeToMoveTest6) { LLVMContext C; std::unique_ptr M = parseIR( - C, R"(define void @dependence(i1 %cond, i32* noalias %A, i32* noalias %B){ + C, R"(define void @dependence(i1 %cond, ptr noalias %A, ptr noalias %B){ 
entry: br i1 %cond, label %bb0, label %bb1 bb0: br label %bb1 bb1: - store i32 0, i32* %A, align 4 ; storeA0 + store i32 0, ptr %A, align 4 ; storeA0 br i1 %cond, label %bb2, label %bb3 bb2: br label %bb3 bb3: - store i32 2, i32* %A, align 4 ; storeA1 + store i32 2, ptr %A, align 4 ; storeA1 br i1 %cond, label %bb4, label %bb5 bb4: br label %bb5 bb5: - %tmp0 = load i32, i32* %A, align 4 ; loadA0 + %tmp0 = load i32, ptr %A, align 4 ; loadA0 br i1 %cond, label %bb6, label %bb7 bb6: br label %bb7 bb7: - store i32 1, i32* %B, align 4 ; storeB0 + store i32 1, ptr %B, align 4 ; storeB0 br i1 %cond, label %bb8, label %bb9 bb8: br label %bb9 bb9: - %tmp1 = load i32, i32* %A, align 4 ; loadA1 + %tmp1 = load i32, ptr %A, align 4 ; loadA1 br i1 %cond, label %bb10, label %bb11 bb10: br label %bb11 bb11: - store i32 2, i32* %A, align 4 ; storeA2 + store i32 2, ptr %A, align 4 ; storeA2 br i1 %cond, label %bb12, label %bb13 bb12: br label %bb13 bb13: - store i32 4, i32* %B, align 4 ; StoreB1 + store i32 4, ptr %B, align 4 ; StoreB1 br i1 %cond, label %bb14, label %bb15 bb14: br label %bb15 bb15: - %tmp2 = load i32, i32* %A, align 4 ; loadA2 + %tmp2 = load i32, ptr %A, align 4 ; loadA2 br i1 %cond, label %bb16, label %bb17 bb16: br label %bb17 bb17: - %tmp3 = load i32, i32* %A, align 4 ; loadA3 + %tmp3 = load i32, ptr %A, align 4 ; loadA3 br i1 %cond, label %bb18, label %bb19 bb18: br label %bb19 bb19: - %tmp4 = load i32, i32* %B, align 4 ; loadB2 + %tmp4 = load i32, ptr %B, align 4 ; loadB2 br i1 %cond, label %bb20, label %bb21 bb20: br label %bb21 bb21: - %tmp5 = load i32, i32* %B, align 4 ; loadB3 + %tmp5 = load i32, ptr %B, align 4 ; loadB3 ret void })"); run(*M, "dependence", diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp index 4908eda16e002..c37ed5d8613b0 100644 --- a/llvm/unittests/Transforms/Utils/LocalTest.cpp +++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp @@ -183,7 +183,7 @@ TEST(Local, MergeBasicBlockIntoOnlyPred) { auto resetIR = [&]() { M = parseIR(C, R"( - define i32 @f(i8* %str) { + define i32 @f(ptr %str) { entry: br label %bb2.i bb2.i: ; preds = %bb4.i, %entry @@ -411,7 +411,7 @@ TEST(Local, ConstantFoldTerminator) { define void @indirectbr() { entry: - indirectbr i8* blockaddress(@indirectbr, %bb0), [label %bb0, label %bb1] + indirectbr ptr blockaddress(@indirectbr, %bb0), [label %bb0, label %bb1] bb0: ret void bb1: @@ -420,14 +420,14 @@ TEST(Local, ConstantFoldTerminator) { define void @indirectbr_repeated() { entry: - indirectbr i8* blockaddress(@indirectbr_repeated, %bb0), [label %bb0, label %bb0] + indirectbr ptr blockaddress(@indirectbr_repeated, %bb0), [label %bb0, label %bb0] bb0: ret void } define void @indirectbr_unreachable() { entry: - indirectbr i8* blockaddress(@indirectbr_unreachable, %bb0), [label %bb1] + indirectbr ptr blockaddress(@indirectbr_unreachable, %bb0), [label %bb1] bb0: ret void bb1: @@ -925,7 +925,7 @@ TEST(Local, RemoveUnreachableBlocks) { declare i32 @__gxx_personality_v0(...) 
- define void @invoke_terminator() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { + define void @invoke_terminator() personality ptr @__gxx_personality_v0 { entry: br i1 undef, label %invoke.block, label %exit @@ -943,8 +943,8 @@ TEST(Local, RemoveUnreachableBlocks) { unreachable lpad.block: - %lp = landingpad { i8*, i32 } - catch i8* null + %lp = landingpad { ptr, i32 } + catch ptr null br label %exit exit: diff --git a/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp b/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp index b97bc311f4655..dd03b4f2ae971 100644 --- a/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp +++ b/llvm/unittests/Transforms/Utils/MemTransferLowering.cpp @@ -98,13 +98,13 @@ struct MemTransferLowerTest : public testing::Test { // For that reason expandMemCpyAsLoop is expected to explicitly mark // loads from source and stores to destination as not aliasing. TEST_F(MemTransferLowerTest, MemCpyKnownLength) { - ParseAssembly("declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8 *, i64, i1)\n" - "define void @foo(i8* %dst, i8* %src, i64 %n) optsize {\n" + ParseAssembly("declare void @llvm.memcpy.p0i8.p0i8.i64(ptr, ptr, i64, i1)\n" + "define void @foo(ptr %dst, ptr %src, i64 %n) optsize {\n" "entry:\n" - " %is_not_equal = icmp ne i8* %dst, %src\n" + " %is_not_equal = icmp ne ptr %dst, %src\n" " br i1 %is_not_equal, label %memcpy, label %exit\n" "memcpy:\n" - " call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, " + " call void @llvm.memcpy.p0i8.p0i8.i64(ptr %dst, ptr %src, " "i64 1024, i1 false)\n" " br label %exit\n" "exit:\n" @@ -138,13 +138,13 @@ TEST_F(MemTransferLowerTest, MemCpyKnownLength) { // llvm.memcpy lowering) doesn't alias by making sure the loop can be // successfully vectorized without additional runtime checks. 
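// Both tests drive the LowerMemIntrinsics utility directly; a sketch of
// the call (assumed signature from
// llvm/Transforms/Utils/LowerMemIntrinsics.h; MemCpyI and TTI as in the
// fixture):
if (auto *MemCpy = dyn_cast<MemCpyInst>(MemCpyI)) {
  // Emits an explicit copy loop; the emitted loads and stores carry
  // fresh noalias scopes so later passes can vectorize the loop.
  expandMemCpyAsLoop(MemCpy, TTI);
  MemCpy->eraseFromParent(); // the intrinsic call is now dead
}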
TEST_F(MemTransferLowerTest, VecMemCpyKnownLength) { - ParseAssembly("declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8 *, i64, i1)\n" - "define void @foo(i8* %dst, i8* %src, i64 %n) optsize {\n" + ParseAssembly("declare void @llvm.memcpy.p0i8.p0i8.i64(ptr, ptr, i64, i1)\n" + "define void @foo(ptr %dst, ptr %src, i64 %n) optsize {\n" "entry:\n" - " %is_not_equal = icmp ne i8* %dst, %src\n" + " %is_not_equal = icmp ne ptr %dst, %src\n" " br i1 %is_not_equal, label %memcpy, label %exit\n" "memcpy:\n" - " call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, " + " call void @llvm.memcpy.p0i8.p0i8.i64(ptr %dst, ptr %src, " "i64 1024, i1 false)\n" " br label %exit\n" "exit:\n" @@ -176,16 +176,16 @@ TEST_F(MemTransferLowerTest, VecMemCpyKnownLength) { TEST_F(MemTransferLowerTest, AtomicMemCpyKnownLength) { ParseAssembly("declare void " - "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(i32*, " + "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(ptr, " "i32 *, i64, i32)\n" - "define void @foo(i32* %dst, i32* %src, i64 %n) optsize {\n" + "define void @foo(ptr %dst, ptr %src, i64 %n) optsize {\n" "entry:\n" - " %is_not_equal = icmp ne i32* %dst, %src\n" + " %is_not_equal = icmp ne ptr %dst, %src\n" " br i1 %is_not_equal, label %memcpy, label %exit\n" "memcpy:\n" " call void " - "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(i32* " - "%dst, i32* %src, " + "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(ptr " + "%dst, ptr %src, " "i64 1024, i32 4)\n" " br label %exit\n" "exit:\n" @@ -221,16 +221,16 @@ TEST_F(MemTransferLowerTest, AtomicMemCpyKnownLength) { TEST_F(MemTransferLowerTest, AtomicMemCpyUnKnownLength) { ParseAssembly("declare void " - "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(i32*, " + "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(ptr, " "i32 *, i64, i32)\n" - "define void @foo(i32* %dst, i32* %src, i64 %n) optsize {\n" + "define void @foo(ptr %dst, ptr %src, i64 %n) optsize {\n" "entry:\n" - " %is_not_equal = icmp ne i32* %dst, %src\n" + " %is_not_equal = icmp ne ptr %dst, %src\n" " br i1 %is_not_equal, label %memcpy, label %exit\n" "memcpy:\n" " call void " - "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(i32* " - "%dst, i32* %src, " + "@llvm.memcpy.element.unordered.atomic.p0i32.p0i32.i64(ptr " + "%dst, ptr %src, " "i64 %n, i32 4)\n" " br label %exit\n" "exit:\n" diff --git a/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp b/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp index 55eae64fe0f6d..4fe30805f9499 100644 --- a/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp +++ b/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp @@ -121,18 +121,18 @@ TEST_F(ScalarEvolutionExpanderTest, ExpandPtrTypeSCEV) { TEST_F(ScalarEvolutionExpanderTest, SCEVZeroExtendExprNonIntegral) { /* * Create the following code: - * func(i64 addrspace(10)* %arg) + * func(ptr addrspace(10) %arg) * top: * br label %L.ph * L.ph: - * %gepbase = getelementptr i64 addrspace(10)* %arg, i64 1 + * %gepbase = getelementptr ptr addrspace(10) %arg, i64 1 * br label %L * L: * %phi = phi i64 [i64 0, %L.ph], [ %add, %L2 ] * %add = add i64 %phi2, 1 * br i1 undef, label %post, label %L2 * post: - * #= %gep = getelementptr i64 addrspace(10)* %gepbase, i64 %add =# + * #= %gep = getelementptr ptr addrspace(10) %gepbase, i64 %add =# * ret void * * We will create the appropriate SCEV expression for %gep and expand it, @@ -199,7 +199,7 @@ TEST_F(ScalarEvolutionExpanderTest, SCEVZeroExtendExprNonIntegral) { 
TEST_F(ScalarEvolutionExpanderTest, SCEVExpanderIsSafeToExpandAt) { /* * Create the following code: - * func(i64 addrspace(10)* %arg) + * func(ptr addrspace(10) %arg) * top: * br label %L.ph * L.ph: @@ -704,14 +704,14 @@ TEST_F(ScalarEvolutionExpanderTest, SCEVExpanderShlNSW) { EXPECT_FALSE(I->hasNoSignedWrap()); }; - checkOneCase("define void @f(i16* %arrayidx) { " - " %1 = load i16, i16* %arrayidx " + checkOneCase("define void @f(ptr %arrayidx) { " + " %1 = load i16, ptr %arrayidx " " %2 = and i16 %1, -32768 " " ret void " "} "); - checkOneCase("define void @f(i8* %arrayidx) { " - " %1 = load i8, i8* %arrayidx " + checkOneCase("define void @f(ptr %arrayidx) { " + " %1 = load i8, ptr %arrayidx " " %2 = and i8 %1, -128 " " ret void " "} "); diff --git a/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp b/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp index eec10110e6af4..7ba259deb574c 100644 --- a/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp +++ b/llvm/unittests/Transforms/Utils/UnrollLoopTest.cpp @@ -34,7 +34,7 @@ TEST(LoopUnrollRuntime, Latch) { std::unique_ptr M = parseIR( C, - R"(define i32 @test(i32* %a, i32* %b, i32* %c, i64 %n) { + R"(define i32 @test(ptr %a, ptr %b, ptr %c, i64 %n) { entry: br label %while.cond @@ -44,13 +44,13 @@ while.cond: ; preds = %while.body, %entry br i1 %cmp, label %while.body, label %while.end while.body: ; preds = %while.cond - %arrayidx = getelementptr inbounds i32, i32* %b, i64 %i.0 - %0 = load i32, i32* %arrayidx - %arrayidx1 = getelementptr inbounds i32, i32* %c, i64 %i.0 - %1 = load i32, i32* %arrayidx1 + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %i.0 + %0 = load i32, ptr %arrayidx + %arrayidx1 = getelementptr inbounds i32, ptr %c, i64 %i.0 + %1 = load i32, ptr %arrayidx1 %mul = mul nsw i32 %0, %1 - %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %i.0 - store i32 %mul, i32* %arrayidx2 + %arrayidx2 = getelementptr inbounds i32, ptr %a, i64 %i.0 + store i32 %mul, ptr %arrayidx2 %inc = add nsw i64 %i.0, 1 br label %while.cond diff --git a/llvm/unittests/Transforms/Utils/ValueMapperTest.cpp b/llvm/unittests/Transforms/Utils/ValueMapperTest.cpp index e39cd7038b280..7f12deae2ad1b 100644 --- a/llvm/unittests/Transforms/Utils/ValueMapperTest.cpp +++ b/llvm/unittests/Transforms/Utils/ValueMapperTest.cpp @@ -74,7 +74,7 @@ TEST(ValueMapperTest, mapMDNodeDuplicatedCycle) { // Create a cycle that references G0. 
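// The operation under test, in isolation: remap a cyclic metadata graph
// so its global reference is replaced while the !0 <-> !1 cycle is
// preserved. Sketch (G1, a second global to map G0 onto, is hypothetical
// here):
ValueToValueMapTy VM;
VM[G0.get()] = G1.get();
MDNode *MappedN1 = MapMetadata(N1, VM, RF_None);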
MDNode *N0; // !0 = !{!1} - MDNode *N1; // !1 = !{!0, i8* @G0} + MDNode *N1; // !1 = !{!0, ptr @G0} { auto T0 = MDTuple::getTemporary(Context, nullptr); Metadata *Ops1[] = {T0.get(), ConstantAsMetadata::get(G0.get())}; diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index ee3cd8c20f8e7..32994c12aa98b 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -1103,7 +1103,8 @@ void InstrInfoEmitter::run(raw_ostream &OS) { Twine ClassName = TargetName + "GenInstrInfo"; OS << "struct " << ClassName << " : public TargetInstrInfo {\n" << " explicit " << ClassName - << "(const TargetSubtargetInfo &STI, unsigned CFSetupOpcode = ~0u, " + << "(const TargetSubtargetInfo &STI, const TargetRegisterInfo &TRI, " "unsigned CFSetupOpcode = ~0u, " "unsigned CFDestroyOpcode = ~0u, " "unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u);\n" << " ~" << ClassName << "() override = default;\n" @@ -1157,9 +1158,11 @@ void InstrInfoEmitter::run(raw_ostream &OS) { << TargetName << "InstrComplexDeprecationInfos[];\n"; Twine ClassName = TargetName + "GenInstrInfo"; OS << ClassName << "::" << ClassName - << "(const TargetSubtargetInfo &STI, unsigned CFSetupOpcode, unsigned " + << "(const TargetSubtargetInfo &STI, const TargetRegisterInfo &TRI, " "unsigned CFSetupOpcode, unsigned " "CFDestroyOpcode, unsigned CatchRetOpcode, unsigned ReturnOpcode)\n" - << " : TargetInstrInfo(CFSetupOpcode, CFDestroyOpcode, CatchRetOpcode, " + << " : TargetInstrInfo(TRI, CFSetupOpcode, CFDestroyOpcode, " "CatchRetOpcode, " "ReturnOpcode"; if (NumClassesByHwMode != 0) OS << ", " << TargetName diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 1cc5b74a3cb67..63ad73370a439 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -512,8 +512,7 @@ def NVVM_ReduxOp : //===----------------------------------------------------------------------===// def NVVM_NanosleepOp : NVVM_Op<"nanosleep">, - Arguments<(ins - ConfinedAttr<I32Attr, [IntMinValue<0>, IntMaxValue<1000000>]>:$duration)> + Arguments<(ins I32:$duration)> { let summary = "Suspends the thread for a specified duration."; @@ -531,8 +530,7 @@ def NVVM_NanosleepOp : NVVM_Op<"nanosleep">, string llvmBuilder = [{ createIntrinsicCall(builder, - llvm::Intrinsic::nvvm_nanosleep, - {builder.getInt32($duration)}); + llvm::Intrinsic::nvvm_nanosleep, {$duration}); }]; let assemblyFormat = "attr-dict $duration"; } diff --git a/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td b/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td index 34f333e556deb..f5e4afad535e5 100644 --- a/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td @@ -161,4 +161,43 @@ def SetOpLayoutAttrOp : Op<Transform_Dialect, "xegpu.set_op_layout_attr", [ +def SetGPULaunchThreadsOp : Op<Transform_Dialect, "xegpu.set_gpu_launch_threads", [ + DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, + TransformOpInterface + ]> { + + let summary = "Set number of threads for a given gpu.launch operation"; + let description = [{ Overrides the x,y,z threads operands of a given `gpu.launch` operation in-place. 
+ }]; + + let arguments = (ins TransformHandleTypeInterface:$target, + Variadic:$threads, + DefaultValuedOptionalAttr:$static_threads + ); + let results = (outs); + let builders = [ + OpBuilder<(ins "Value":$target, "ArrayRef":$mixedThreads)>, + ]; + + let assemblyFormat = [{ + $target + `threads` `=` custom($threads, $static_threads) + attr-dict `:` qualified(type(operands)) + }]; + + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure apply( + ::mlir::transform::TransformRewriter &rewriter, + ::mlir::transform::TransformResults &transformResults, + ::mlir::transform::TransformState &state); + + ::llvm::SmallVector<::mlir::OpFoldResult> getMixedThreads() { + Builder b(getContext()); + return getMixedValues(getStaticThreads(), getThreads(), b); + } + }]; +} + #endif // XEGPU_TRANSFORM_OPS diff --git a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp index 6200366cded29..e6adcde72ad66 100644 --- a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp +++ b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp @@ -140,6 +140,9 @@ static bool resultIsNotRead(Operation *op, std::vector &uses) { std::vector opUses; for (OpOperand &use : op->getUses()) { Operation *useOp = use.getOwner(); + // Use escaped the scope + if (useOp->mightHaveTrait()) + return false; if (isa(useOp) || (useOp->getNumResults() == 0 && useOp->getNumRegions() == 0 && !mlir::hasEffect(useOp)) || diff --git a/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp b/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp index 5fdd8534e4e51..7a7a8c9066f09 100644 --- a/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp +++ b/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" @@ -341,6 +342,69 @@ void transform::SetOpLayoutAttrOp::getEffects( modifiesPayload(effects); } +void transform::SetGPULaunchThreadsOp::build( + OpBuilder &builder, OperationState &ostate, Value target, + ArrayRef mixedThreads) { + SmallVector staticThreads; + SmallVector dynamicThreads; + dispatchIndexOpFoldResults(mixedThreads, dynamicThreads, staticThreads); + build(builder, ostate, target.getType(), + /*target=*/target, + /*threads=*/dynamicThreads, + /*static_threads=*/staticThreads); +} + +DiagnosedSilenceableFailure +transform::SetGPULaunchThreadsOp::apply(transform::TransformRewriter &rewriter, + transform::TransformResults &results, + transform::TransformState &state) { + auto targetOps = state.getPayloadOps(getTarget()); + if (!llvm::hasSingleElement(targetOps)) { + return emitDefiniteFailure() << "Requires exactly one targetOp handle (got " + << llvm::range_size(targetOps) << ")"; + } + Operation *target = *targetOps.begin(); + + auto launchOp = dyn_cast(target); + if (!launchOp) { + auto diag = emitSilenceableFailure(getLoc()) + << "Expected a gpu.launch op, but got: " << target->getName(); + diag.attachNote(target->getLoc()) << "target op"; + return diag; + } + + SmallVector threads; + DiagnosedSilenceableFailure status = + convertMixedValuesToInt(state, (*this), threads, getMixedThreads()); + if (!status.succeeded()) + return status; + + if (threads.size() != 3) { + return emitSilenceableFailure(getLoc()) + << "Expected threads argument to consist of 
three values (got " + << threads.size() << ")"; + } + + rewriter.setInsertionPoint(launchOp); + auto createConstValue = [&](int value) { + return arith::ConstantIndexOp::create(rewriter, launchOp.getLoc(), value); + }; + + // Replace threads in-place. + launchOp.getBlockSizeXMutable().assign(createConstValue(threads[0])); + launchOp.getBlockSizeYMutable().assign(createConstValue(threads[1])); + launchOp.getBlockSizeZMutable().assign(createConstValue(threads[2])); + + return DiagnosedSilenceableFailure::success(); +} + +void transform::SetGPULaunchThreadsOp::getEffects( + ::llvm::SmallVectorImpl &effects) { + onlyReadsHandle(getTargetMutable(), effects); + onlyReadsHandle(getThreadsMutable(), effects); + modifiesPayload(effects); +} + namespace { class XeGPUTransformDialectExtension : public transform::TransformDialectExtension< diff --git a/mlir/python/mlir/dialects/transform/xegpu.py b/mlir/python/mlir/dialects/transform/xegpu.py index ce8015d8f557b..309883cfc4518 100644 --- a/mlir/python/mlir/dialects/transform/xegpu.py +++ b/mlir/python/mlir/dialects/transform/xegpu.py @@ -132,3 +132,39 @@ def __init__( loc=loc, ip=ip, ) + + +class SetGPULaunchThreadsOp(SetGPULaunchThreadsOp): + """Specialization for SetGPULaunchThreadsOp class.""" + + def __init__( + self, + launch_op: Union[Operation, Value], + threads: MixedValues, + *, + loc=None, + ip=None, + ): + ( + dynamic_threads, + static_threads, + _, + ) = _dispatch_dynamic_index_list(threads) + + super().__init__( + _get_op_result_or_value(launch_op), + dynamic_threads, + static_threads=static_threads, + loc=loc, + ip=ip, + ) + + +def set_gpu_launch_threads( + launch_op: Union[Operation, Value], + threads: MixedValues, + *, + loc=None, + ip=None, +) -> SetGPULaunchThreadsOp: + return SetGPULaunchThreadsOp(launch_op, threads, loc=loc, ip=ip) diff --git a/mlir/test/Dialect/MemRef/transform-ops.mlir b/mlir/test/Dialect/MemRef/transform-ops.mlir index 3b37c62fcb28e..6e130912c47e9 100644 --- a/mlir/test/Dialect/MemRef/transform-ops.mlir +++ b/mlir/test/Dialect/MemRef/transform-ops.mlir @@ -306,6 +306,23 @@ module attributes {transform.with_named_sequence} { // ----- +// CHECK-LABEL: func.func @dead_alloc_escaped +func.func @dead_alloc_escaped() -> memref<8x64xf32, 3> { + // CHECK: %{{.+}} = memref.alloc + %0 = memref.alloc() : memref<8x64xf32, 3> + return %0 : memref<8x64xf32, 3> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.memref.erase_dead_alloc_and_stores %0 : (!transform.any_op) -> () + transform.yield + } +} + +// ----- + // CHECK-LABEL: func.func @dead_alloc func.func @dead_alloc() { // CHECK-NOT: %{{.+}} = memref.alloc diff --git a/mlir/test/Dialect/XeGPU/transform-ops-invalid.mlir b/mlir/test/Dialect/XeGPU/transform-ops-invalid.mlir index 726b6748452ae..24f500658f740 100644 --- a/mlir/test/Dialect/XeGPU/transform-ops-invalid.mlir +++ b/mlir/test/Dialect/XeGPU/transform-ops-invalid.mlir @@ -71,3 +71,56 @@ module attributes {transform.with_named_sequence} { transform.yield } } + +// ----- + +func.func @set_gpu_launch_threads_bad_handle(%arg0: memref<4096x4096xf16>) { + %c32 = arith.constant 32 : index // expected-note {{target op}} + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = 
transform.structured.match ops{["arith.constant"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error@below {{Expected a gpu.launch op, but got: arith.constant}} + transform.xegpu.set_gpu_launch_threads %0 threads = [8, 4, 1] : !transform.any_op + transform.yield + } +} + +// ----- + +func.func @set_gpu_launch_threads_many_handles(%arg0: memref<4096x4096xf16>) { + %c32 = arith.constant 32 : index + %c64 = arith.constant 64 : index + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["arith.constant"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error@below {{Requires exactly one targetOp handle (got 2)}} + transform.xegpu.set_gpu_launch_threads %0 threads = [8, 4, 1] : !transform.any_op + transform.yield + } +} + +// ----- + +func.func @set_gpu_launch_threads_bad_threads(%arg0: memref<4096x4096xf16>) { + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + gpu.launch blocks(%arg3, %arg4, %arg5) in (%arg9 = %c16, %arg10 = %c16, %arg11 = %c1) threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { + gpu.terminator + } + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["gpu.launch"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error@below {{Expected threads argument to consist of three values (got 2)}} + transform.xegpu.set_gpu_launch_threads %0 threads = [8, 4] : !transform.any_op + transform.yield + } +} diff --git a/mlir/test/Dialect/XeGPU/transform-ops.mlir b/mlir/test/Dialect/XeGPU/transform-ops.mlir index bd6a79244ed30..7f2fbe4271a43 100644 --- a/mlir/test/Dialect/XeGPU/transform-ops.mlir +++ b/mlir/test/Dialect/XeGPU/transform-ops.mlir @@ -230,6 +230,7 @@ module attributes {transform.with_named_sequence} { transform.yield } } + // ----- // CHECK-LABEL: @set_op_layout_attr_operand1 @@ -252,3 +253,58 @@ module attributes {transform.with_named_sequence} { transform.yield } } + +// ----- + +// CHECK-LABEL: @set_gpu_launch_threads +func.func @set_gpu_launch_threads(%arg0: memref<4096x4096xf16>) { + // CHECK: %[[C1:.+]] = arith.constant 1 : index + %c1 = arith.constant 1 : index + // CHECK: %[[C16:.+]] = arith.constant 16 : index + %c16 = arith.constant 16 : index + // CHECK: %[[C8:.+]] = arith.constant 8 : index + // CHECK: %[[C4:.+]] = arith.constant 4 : index + // CHECK: %[[C1_0:.+]] = arith.constant 1 : index + // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C16]], %{{.*}} = %[[C16]], %{{.*}} = %[[C1]]) + // CHECK-SAME: threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C8]], %{{.*}} = %[[C4]], %{{.*}} = %[[C1_0]]) + gpu.launch blocks(%arg3, %arg4, %arg5) in (%arg9 = %c16, %arg10 = %c16, %arg11 = %c1) threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { + gpu.terminator + } + return +} +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["gpu.launch"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // CHECK: transform.xegpu.set_gpu_launch_threads %{{.*}} + transform.xegpu.set_gpu_launch_threads %0 threads = [8, 4, 1] : !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: 
@set_gpu_launch_threads_param +func.func @set_gpu_launch_threads_param(%arg0: memref<4096x4096xf16>) { + // CHECK: %[[C1:.+]] = arith.constant 1 : index + %c1 = arith.constant 1 : index + // CHECK: %[[C16:.+]] = arith.constant 16 : index + %c16 = arith.constant 16 : index + // CHECK: %[[C8:.+]] = arith.constant 8 : index + // CHECK: %[[C4:.+]] = arith.constant 4 : index + // CHECK: %[[C1_0:.+]] = arith.constant 1 : index + // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C16]], %{{.*}} = %[[C16]], %{{.*}} = %[[C1]]) + // CHECK-SAME: threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %[[C8]], %{{.*}} = %[[C4]], %{{.*}} = %[[C1_0]]) + gpu.launch blocks(%arg3, %arg4, %arg5) in (%arg9 = %c16, %arg10 = %c16, %arg11 = %c1) threads(%arg6, %arg7, %arg8) in (%arg12 = %c1, %arg13 = %c1, %arg14 = %c1) { + gpu.terminator + } + return +} +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["gpu.launch"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // CHECK: transform.xegpu.set_gpu_launch_threads %{{.*}} + %th1 = transform.param.constant 4 : i64 -> !transform.param + transform.xegpu.set_gpu_launch_threads %0 threads = [8, %th1, 1] : !transform.any_op, !transform.param + transform.yield + } +} diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir index 42aa2210eae1a..d5868ee73cc50 100644 --- a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir @@ -578,14 +578,6 @@ llvm.func @ld_matrix(%arg0: !llvm.ptr<3>) { // ----- -llvm.func @nanosleep() { - // expected-error@+1 {{integer constant out of range for attribute}} - nvvm.nanosleep 100000000000000 - llvm.return -} - -// ----- - llvm.func @clusterlaunchcontrol_query_cancel_is_canceled_invalid_return_type(%try_cancel_response: i128) { // expected-error@+1 {{'nvvm.clusterlaunchcontrol.query.cancel' op is_canceled query type returns an i1}} %res = nvvm.clusterlaunchcontrol.query.cancel query = is_canceled, %try_cancel_response : i32 diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index 1ec55408e97a5..fec54cbf5e3e5 100644 --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -970,8 +970,8 @@ llvm.func @nvvm_pmevent() { // ----- // CHECK-LABEL: @nanosleep -llvm.func @nanosleep() { - // CHECK: call void @llvm.nvvm.nanosleep(i32 4000) - nvvm.nanosleep 4000 +llvm.func @nanosleep(%duration: i32) { + // CHECK: call void @llvm.nvvm.nanosleep(i32 %{{.*}}) + nvvm.nanosleep %duration llvm.return } diff --git a/mlir/test/python/dialects/transform_xegpu_ext.py b/mlir/test/python/dialects/transform_xegpu_ext.py index 0b587d2020aa6..dc91f5e982579 100644 --- a/mlir/test/python/dialects/transform_xegpu_ext.py +++ b/mlir/test/python/dialects/transform_xegpu_ext.py @@ -113,3 +113,18 @@ def setOpLayoutAttrResult(): # CHECK: sg_layout = [6, 4] # CHECK: sg_data = [32, 16] # CHECK: inst_data = [8, 16] + + +@run +def setGPULaunchThreadsOp(): + sequence = transform.SequenceOp( + transform.FailurePropagationMode.Propagate, + [], + transform.OperationType.get("gpu.launch"), + ) + with InsertionPoint(sequence.body): + xegpu.set_gpu_launch_threads(sequence.bodyTarget, threads=[8, 4, 1]) + transform.YieldOp() + # CHECK-LABEL: TEST: setGPULaunchThreadsOp + # CHECK: transform.xegpu.set_gpu_launch_threads + # CHECK: threads = [8, 4, 1] diff --git 
a/orc-rt/include/orc-rt-c/WrapperFunction.h b/orc-rt/include/orc-rt-c/WrapperFunction.h index 280e513c9c0e6..fefdf03ff3f06 100644 --- a/orc-rt/include/orc-rt-c/WrapperFunction.h +++ b/orc-rt/include/orc-rt-c/WrapperFunction.h @@ -54,7 +54,7 @@ typedef struct { * Asynchronous return function for an orc-rt wrapper function. */ typedef void (*orc_rt_WrapperFunctionReturn)( - orc_rt_SessionRef Session, void *CallCtx, + orc_rt_SessionRef Session, uint64_t CallId, orc_rt_WrapperFunctionBuffer ResultBytes); /** @@ -62,10 +62,11 @@ typedef void (*orc_rt_WrapperFunctionReturn)( * * ArgBytes contains the serialized arguments for the wrapper function. * Session holds a reference to the session object. - * CallCtx holds a pointer to the context object for this particular call. + * CallId holds an identifier for this particular call. * Return holds a pointer to the return function. */ -typedef void (*orc_rt_WrapperFunction)(orc_rt_SessionRef Session, void *CallCtx, +typedef void (*orc_rt_WrapperFunction)(orc_rt_SessionRef Session, + uint64_t CallId, orc_rt_WrapperFunctionReturn Return, orc_rt_WrapperFunctionBuffer ArgBytes); diff --git a/orc-rt/include/orc-rt/SPSWrapperFunction.h b/orc-rt/include/orc-rt/SPSWrapperFunction.h index 46c08a0c688d0..1ae2c130dc0cf 100644 --- a/orc-rt/include/orc-rt/SPSWrapperFunction.h +++ b/orc-rt/include/orc-rt/SPSWrapperFunction.h @@ -124,10 +124,10 @@ template struct SPSWrapperFunction { } template - static void handle(orc_rt_SessionRef Session, void *CallCtx, + static void handle(orc_rt_SessionRef Session, uint64_t CallId, orc_rt_WrapperFunctionReturn Return, WrapperFunctionBuffer ArgBytes, Handler &&H) { - WrapperFunction::handle(Session, CallCtx, Return, std::move(ArgBytes), + WrapperFunction::handle(Session, CallId, Return, std::move(ArgBytes), WrapperFunctionSPSSerializer(), std::forward(H)); } diff --git a/orc-rt/include/orc-rt/SimpleNativeMemoryMap.h b/orc-rt/include/orc-rt/SimpleNativeMemoryMap.h index cf0e4ac732ca0..20b080e960dea 100644 --- a/orc-rt/include/orc-rt/SimpleNativeMemoryMap.h +++ b/orc-rt/include/orc-rt/SimpleNativeMemoryMap.h @@ -114,21 +114,21 @@ class SimpleNativeMemoryMap : public ResourceManager { } // namespace orc_rt ORC_RT_SPS_INTERFACE void orc_rt_SimpleNativeMemoryMap_reserve_sps_wrapper( - orc_rt_SessionRef Session, void *CallCtx, + orc_rt_SessionRef Session, uint64_t CallId, orc_rt_WrapperFunctionReturn Return, orc_rt_WrapperFunctionBuffer ArgBytes); ORC_RT_SPS_INTERFACE void orc_rt_SimpleNativeMemoryMap_releaseMultiple_sps_wrapper( - orc_rt_SessionRef Session, void *CallCtx, + orc_rt_SessionRef Session, uint64_t CallId, orc_rt_WrapperFunctionReturn Return, orc_rt_WrapperFunctionBuffer ArgBytes); ORC_RT_SPS_INTERFACE void orc_rt_SimpleNativeMemoryMap_initialize_sps_wrapper( - orc_rt_SessionRef Session, void *CallCtx, + orc_rt_SessionRef Session, uint64_t CallId, orc_rt_WrapperFunctionReturn Return, orc_rt_WrapperFunctionBuffer ArgBytes); ORC_RT_SPS_INTERFACE void orc_rt_SimpleNativeMemoryMap_deinitializeMultiple_sps_wrapper( - orc_rt_SessionRef Session, void *CallCtx, + orc_rt_SessionRef Session, uint64_t CallId, orc_rt_WrapperFunctionReturn Return, orc_rt_WrapperFunctionBuffer ArgBytes); #endif // ORC_RT_SIMPLENATIVEMEMORYMAP_H diff --git a/orc-rt/include/orc-rt/WrapperFunction.h b/orc-rt/include/orc-rt/WrapperFunction.h index 6e8b84e980dc0..494e2a0dc0bb1 100644 --- a/orc-rt/include/orc-rt/WrapperFunction.h +++ b/orc-rt/include/orc-rt/WrapperFunction.h @@ -137,14 +137,14 @@ using WFHandlerTraits = CallableTraitsHelper; 
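// For reference, a bare function satisfying the updated C convention
// above (hypothetical "echo" wrapper, not part of this patch; include
// path assumed): it hands the argument buffer straight back and threads
// CallId through unchanged.
#include "orc-rt-c/WrapperFunction.h"
static void echo_wrapper(orc_rt_SessionRef Session, uint64_t CallId,
                         orc_rt_WrapperFunctionReturn Return,
                         orc_rt_WrapperFunctionBuffer ArgBytes) {
  Return(Session, CallId, ArgBytes);
}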
diff --git a/orc-rt/include/orc-rt/WrapperFunction.h b/orc-rt/include/orc-rt/WrapperFunction.h
index 6e8b84e980dc0..494e2a0dc0bb1 100644
--- a/orc-rt/include/orc-rt/WrapperFunction.h
+++ b/orc-rt/include/orc-rt/WrapperFunction.h
@@ -137,14 +137,14 @@ using WFHandlerTraits = CallableTraitsHelper;
 template <typename Serializer> class StructuredYieldBase {
 public:
-  StructuredYieldBase(orc_rt_SessionRef Session, void *CallCtx,
+  StructuredYieldBase(orc_rt_SessionRef Session, uint64_t CallId,
                       orc_rt_WrapperFunctionReturn Return, Serializer &&S)
-      : Session(Session), CallCtx(CallCtx), Return(Return),
+      : Session(Session), CallId(CallId), Return(Return),
         S(std::forward<Serializer>(S)) {}

 protected:
   orc_rt_SessionRef Session;
-  void *CallCtx;
+  uint64_t CallId;
   orc_rt_WrapperFunctionReturn Return;
   std::decay_t<Serializer> S;
 };
@@ -158,9 +158,9 @@ class StructuredYield, Serializer>
   using StructuredYieldBase<Serializer>::StructuredYieldBase;
   void operator()(RetT &&R) {
     if (auto ResultBytes = this->S.result().serialize(std::forward<RetT>(R)))
-      this->Return(this->Session, this->CallCtx, ResultBytes->release());
+      this->Return(this->Session, this->CallId, ResultBytes->release());
     else
-      this->Return(this->Session, this->CallCtx,
+      this->Return(this->Session, this->CallId,
                    WrapperFunctionBuffer::createOutOfBandError(
                        "Could not serialize wrapper function result data")
                        .release());
@@ -173,7 +173,7 @@ class StructuredYield, Serializer>
 public:
   using StructuredYieldBase<Serializer>::StructuredYieldBase;
   void operator()() {
-    this->Return(this->Session, this->CallCtx,
+    this->Return(this->Session, this->CallId,
                  WrapperFunctionBuffer().release());
   }
 };
@@ -251,12 +251,12 @@ struct WrapperFunction {
   ///
   ///
   /// static void adder_add_async_sps_wrapper(
-  ///     orc_rt_SessionRef Session, void *CallCtx,
+  ///     orc_rt_SessionRef Session, uint64_t CallId,
   ///     orc_rt_WrapperFunctionReturn Return,
   ///     orc_rt_WrapperFunctionBuffer ArgBytes) {
   ///   using SPSSig = SPSString(SPSExecutorAddr, int32_t, bool);
   ///   SPSWrapperFunction<SPSSig>::handle(
-  ///       Session, CallCtx, Return, ArgBytes,
+  ///       Session, CallId, Return, ArgBytes,
   ///       WrapperFunction::handleWithAsyncMethod(&MyClass::myMethod));
   /// }
   /// @endcode
@@ -313,12 +313,12 @@ struct WrapperFunction {
   ///
   ///
   /// static void adder_add_sync_sps_wrapper(
-  ///     orc_rt_SessionRef Session, void *CallCtx,
+  ///     orc_rt_SessionRef Session, uint64_t CallId,
   ///     orc_rt_WrapperFunctionReturn Return,
   ///     orc_rt_WrapperFunctionBuffer ArgBytes) {
   ///   using SPSSig = SPSString(SPSExecutorAddr, int32_t, bool);
   ///   SPSWrapperFunction<SPSSig>::handle(
-  ///       Session, CallCtx, Return, ArgBytes,
+  ///       Session, CallId, Return, ArgBytes,
   ///       WrapperFunction::handleWithSyncMethod(&Adder::addSync));
   /// }
   /// @endcode
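For readers of the two doc-comment examples above, a plausible shape for the MyClass::myMethod bound by handleWithAsyncMethod is sketched below. This is an assumption, not code from this patch: the real class is never shown, and orc_rt::move_only_function plus <string> are taken to be in scope.

// Sketch: a method compatible with the SPSString(SPSExecutorAddr, int32_t,
// bool) example above. The leading parameter is the reply continuation; the
// SPSExecutorAddr selects the instance, and the remaining SPS arguments
// arrive as ordinary parameters.
class MyClass {
public:
  void myMethod(move_only_function<void(std::string)> Return, int32_t X,
                bool Negate) {
    // The reply may be issued later, from another thread; this sketch
    // answers immediately.
    Return(std::to_string(Negate ? -X : X));
  }
};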
@@ -368,7 +368,7 @@ struct WrapperFunction {
   /// This utility deserializes and serializes arguments and return values
   /// (using the given Serializer), and calls the given handler.
   template <typename Serializer, typename Handler>
-  static void handle(orc_rt_SessionRef Session, void *CallCtx,
+  static void handle(orc_rt_SessionRef Session, uint64_t CallId,
                      orc_rt_WrapperFunctionReturn Return,
                      WrapperFunctionBuffer ArgBytes, Serializer &&S,
                      Handler &&H) {
@@ -380,16 +380,16 @@ struct WrapperFunction {
     typedef typename CallableArgInfo::args_tuple_type RetTupleType;

     if (ArgBytes.getOutOfBandError())
-      return Return(Session, CallCtx, ArgBytes.release());
+      return Return(Session, CallId, ArgBytes.release());

     if (auto Args = S.arguments().template deserialize(ArgBytes))
       std::apply(HandlerTraits::forwardArgsAsRequested(bind_front(
                      std::forward<Handler>(H),
                      detail::StructuredYield(
-                         Session, CallCtx, Return, std::move(S)))),
+                         Session, CallId, Return, std::move(S)))),
                  *Args);
     else
-      Return(Session, CallCtx,
+      Return(Session, CallId,
              WrapperFunctionBuffer::createOutOfBandError(
                  "Could not deserialize wrapper function arg data")
                  .release());
diff --git a/orc-rt/lib/executor/SimpleNativeMemoryMap.cpp b/orc-rt/lib/executor/SimpleNativeMemoryMap.cpp
index 5d410acbc65d9..c6a005d6a70e6 100644
--- a/orc-rt/lib/executor/SimpleNativeMemoryMap.cpp
+++ b/orc-rt/lib/executor/SimpleNativeMemoryMap.cpp
@@ -367,45 +367,45 @@ Error SimpleNativeMemoryMap::recordDeallocActions(
 }

 ORC_RT_SPS_INTERFACE void orc_rt_SimpleNativeMemoryMap_reserve_sps_wrapper(
-    orc_rt_SessionRef Session, void *CallCtx,
+    orc_rt_SessionRef Session, uint64_t CallId,
     orc_rt_WrapperFunctionReturn Return, orc_rt_WrapperFunctionBuffer ArgBytes) {
   using Sig = SPSExpected<SPSExecutorAddr>(SPSExecutorAddr, SPSSize);
   SPSWrapperFunction<Sig>::handle(
-      Session, CallCtx, Return, ArgBytes,
+      Session, CallId, Return, ArgBytes,
       WrapperFunction::handleWithAsyncMethod(&SimpleNativeMemoryMap::reserve));
 }

 ORC_RT_SPS_INTERFACE void
 orc_rt_SimpleNativeMemoryMap_releaseMultiple_sps_wrapper(
-    orc_rt_SessionRef Session, void *CallCtx,
+    orc_rt_SessionRef Session, uint64_t CallId,
     orc_rt_WrapperFunctionReturn Return, orc_rt_WrapperFunctionBuffer ArgBytes) {
   using Sig = SPSError(SPSExecutorAddr, SPSSequence<SPSExecutorAddr>);
-  SPSWrapperFunction<Sig>::handle(Session, CallCtx, Return, ArgBytes,
+  SPSWrapperFunction<Sig>::handle(Session, CallId, Return, ArgBytes,
                                   WrapperFunction::handleWithAsyncMethod(
                                       &SimpleNativeMemoryMap::releaseMultiple));
 }

 ORC_RT_SPS_INTERFACE void orc_rt_SimpleNativeMemoryMap_initialize_sps_wrapper(
-    orc_rt_SessionRef Session, void *CallCtx,
+    orc_rt_SessionRef Session, uint64_t CallId,
     orc_rt_WrapperFunctionReturn Return, orc_rt_WrapperFunctionBuffer ArgBytes) {
   using Sig = SPSExpected<SPSExecutorAddr>(
       SPSExecutorAddr, SPSSimpleNativeMemoryMapInitializeRequest);
-  SPSWrapperFunction<Sig>::handle(Session, CallCtx, Return, ArgBytes,
+  SPSWrapperFunction<Sig>::handle(Session, CallId, Return, ArgBytes,
                                   WrapperFunction::handleWithAsyncMethod(
                                       &SimpleNativeMemoryMap::initialize));
 }

 ORC_RT_SPS_INTERFACE void
 orc_rt_SimpleNativeMemoryMap_deinitializeMultiple_sps_wrapper(
-    orc_rt_SessionRef Session, void *CallCtx,
+    orc_rt_SessionRef Session, uint64_t CallId,
     orc_rt_WrapperFunctionReturn Return, orc_rt_WrapperFunctionBuffer ArgBytes) {
   using Sig = SPSError(SPSExecutorAddr, SPSSequence<SPSExecutorAddr>);
   SPSWrapperFunction<Sig>::handle(
-      Session, CallCtx, Return, ArgBytes,
+      Session, CallId, Return, ArgBytes,
       WrapperFunction::handleWithAsyncMethod(
           &SimpleNativeMemoryMap::deinitializeMultiple));
 }
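The DirectCaller change below packs a heap-allocated DirectResultSender pointer into the new CallId argument and recovers it inside the static send shim. In isolation the round trip looks like this (a sketch, assuming uintptr_t is no wider than uint64_t, which holds on the 32- and 64-bit targets orc-rt supports):

#include <cstdint>

// Sketch: pointer -> CallId -> pointer, mirroring DirectCaller's use of the
// new ABI. Widening through uintptr_t to uint64_t and back is lossless
// whenever uintptr_t fits in 64 bits.
static bool callIdRoundTripIsLossless(void *Obj) {
  uint64_t CallId = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(Obj));
  void *Back = reinterpret_cast<void *>(static_cast<uintptr_t>(CallId));
  return Back == Obj;
}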
diff --git a/orc-rt/unittests/DirectCaller.h b/orc-rt/unittests/DirectCaller.h
index 7c5c9d300d882..fab006717c157 100644
--- a/orc-rt/unittests/DirectCaller.h
+++ b/orc-rt/unittests/DirectCaller.h
@@ -22,10 +22,11 @@ class DirectCaller {
     virtual ~DirectResultSender() {}
     virtual void send(orc_rt_SessionRef Session,
                       orc_rt::WrapperFunctionBuffer ResultBytes) = 0;
-    static void send(orc_rt_SessionRef Session, void *CallCtx,
+    static void send(orc_rt_SessionRef Session, uint64_t CallId,
                      orc_rt_WrapperFunctionBuffer ResultBytes) {
       std::unique_ptr<DirectResultSender>(
-          reinterpret_cast<DirectResultSender *>(CallCtx))
+          reinterpret_cast<DirectResultSender *>(
+              static_cast<uintptr_t>(CallId)))
           ->send(Session, ResultBytes);
     }
   };
@@ -59,7 +60,8 @@ class DirectCaller {
                          orc_rt::WrapperFunctionBuffer ArgBytes) {
     auto DR = makeDirectResultSender(std::forward(HandleResult));
-    Fn(Session, reinterpret_cast<void *>(DR.release()),
+    Fn(Session,
+       static_cast<uint64_t>(reinterpret_cast<uintptr_t>(DR.release())),
        DirectResultSender::send, ArgBytes.release());
   }
diff --git a/orc-rt/unittests/SPSWrapperFunctionTest.cpp b/orc-rt/unittests/SPSWrapperFunctionTest.cpp
index 81e5755e821f3..3b1592d89d8f1 100644
--- a/orc-rt/unittests/SPSWrapperFunctionTest.cpp
+++ b/orc-rt/unittests/SPSWrapperFunctionTest.cpp
@@ -22,11 +22,11 @@

 using namespace orc_rt;

-static void void_noop_sps_wrapper(orc_rt_SessionRef Session, void *CallCtx,
+static void void_noop_sps_wrapper(orc_rt_SessionRef Session, uint64_t CallId,
                                   orc_rt_WrapperFunctionReturn Return,
                                   orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<void()>::handle(
-      Session, CallCtx, Return, ArgBytes,
+      Session, CallId, Return, ArgBytes,
       [](move_only_function<void()> Return) { Return(); });
 }

@@ -40,11 +40,12 @@ TEST(SPSWrapperFunctionUtilsTest, VoidNoop) {
   EXPECT_TRUE(Ran);
 }

-static void add_via_lambda_sps_wrapper(orc_rt_SessionRef Session, void *CallCtx,
+static void add_via_lambda_sps_wrapper(orc_rt_SessionRef Session,
+                                       uint64_t CallId,
                                        orc_rt_WrapperFunctionReturn Return,
                                        orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<int32_t(int32_t, int32_t)>::handle(
-      Session, CallCtx, Return, ArgBytes,
+      Session, CallId, Return, ArgBytes,
       [](move_only_function<void(int32_t)> Return, int32_t X, int32_t Y) {
         Return(X + Y);
       });
@@ -64,11 +65,11 @@ static void add_via_function(move_only_function<void(int32_t)> Return,
 }

 static void
-add_via_function_sps_wrapper(orc_rt_SessionRef Session, void *CallCtx,
+add_via_function_sps_wrapper(orc_rt_SessionRef Session, uint64_t CallId,
                              orc_rt_WrapperFunctionReturn Return,
                              orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<int32_t(int32_t, int32_t)>::handle(
-      Session, CallCtx, Return, ArgBytes, add_via_function);
+      Session, CallId, Return, ArgBytes, add_via_function);
 }

@@ -80,11 +81,11 @@ TEST(SPSWrapperFunctionUtilsTest, BinaryOpViaFunction) {
 }

 static void
-add_via_function_pointer_sps_wrapper(orc_rt_SessionRef Session, void *CallCtx,
+add_via_function_pointer_sps_wrapper(orc_rt_SessionRef Session, uint64_t CallId,
                                      orc_rt_WrapperFunctionReturn Return,
                                      orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<int32_t(int32_t, int32_t)>::handle(
-      Session, CallCtx, Return, ArgBytes, &add_via_function);
+      Session, CallId, Return, ArgBytes, &add_via_function);
 }

@@ -96,11 +97,12 @@ TEST(SPSWrapperFunctionUtilsTest, BinaryOpViaFunctionPointer) {
 }

 static void
-round_trip_string_via_span_sps_wrapper(orc_rt_SessionRef Session, void *CallCtx,
+round_trip_string_via_span_sps_wrapper(orc_rt_SessionRef Session,
+                                       uint64_t CallId,
                                        orc_rt_WrapperFunctionReturn Return,
                                        orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<SPSString(SPSString)>::handle(
-      Session, CallCtx, Return, ArgBytes,
+      Session, CallId, Return, ArgBytes,
       [](move_only_function<void(std::string)> Return, span<const char> S) {
         Return({S.data(), S.size()});
       });
@@ -119,11 +121,11 @@ TEST(SPSWrapperFunctionUtilsTest, RoundTripStringViaSpan) {
 }

 static void
 improbable_feat_sps_wrapper(orc_rt_SessionRef Session,
-                            void *CallCtx,
+                            uint64_t CallId,
                             orc_rt_WrapperFunctionReturn Return,
                             orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<SPSError(bool)>::handle(
-      Session, CallCtx, Return, ArgBytes,
+      Session, CallId, Return, ArgBytes,
       [](move_only_function<void(Error)> Return, bool LuckyHat) {
         if (LuckyHat)
           Return(Error::success());
@@ -155,11 +157,11 @@ TEST(SPSWrapperFunctionUtilsTest, TransparentConversionErrorFailureCase) {
   EXPECT_EQ(ErrMsg, "crushed by boulder");
 }

-static void halve_number_sps_wrapper(orc_rt_SessionRef Session, void *CallCtx,
+static void halve_number_sps_wrapper(orc_rt_SessionRef Session, uint64_t CallId,
                                      orc_rt_WrapperFunctionReturn Return,
                                      orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<SPSExpected<int32_t>(int32_t)>::handle(
-      Session, CallCtx, Return, ArgBytes,
+      Session, CallId, Return, ArgBytes,
       [](move_only_function<void(Expected<int32_t>)> Return, int N) {
         if (N % 2 == 0)
           Return(N >> 1);
@@ -208,12 +210,12 @@ class SPSSerializationTraits<SPSOpCounter<N>, OpCounter<N>> {

 static void
 handle_with_reference_types_sps_wrapper(orc_rt_SessionRef Session,
-                                        void *CallCtx,
+                                        uint64_t CallId,
                                         orc_rt_WrapperFunctionReturn Return,
                                         orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<void(SPSOpCounter<0>, SPSOpCounter<1>, SPSOpCounter<2>,
-                          SPSOpCounter<3>)>::handle(Session, CallCtx, Return, ArgBytes,
+                          SPSOpCounter<3>)>::handle(Session, CallId, Return, ArgBytes,
       [](move_only_function<void()> Return, OpCounter<0>, OpCounter<1> &,
          const OpCounter<2> &,
@@ -281,11 +283,11 @@ class Adder {
 } // anonymous namespace

 static void adder_add_async_sps_wrapper(orc_rt_SessionRef Session,
-                                        void *CallCtx,
+                                        uint64_t CallId,
                                         orc_rt_WrapperFunctionReturn Return,
                                         orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<SPSString(SPSExecutorAddr, int32_t, bool)>::handle(
-      Session, CallCtx, Return, ArgBytes,
+      Session, CallId, Return, ArgBytes,
       WrapperFunction::handleWithAsyncMethod(&Adder::addAsync));
 }

@@ -300,11 +302,12 @@ TEST(SPSWrapperFunctionUtilsTest, HandleWithAsyncMethod) {
   EXPECT_EQ(Result, 42);
 }

-static void adder_add_sync_sps_wrapper(orc_rt_SessionRef Session, void *CallCtx,
+static void adder_add_sync_sps_wrapper(orc_rt_SessionRef Session,
+                                       uint64_t CallId,
                                        orc_rt_WrapperFunctionReturn Return,
                                        orc_rt_WrapperFunctionBuffer ArgBytes) {
   SPSWrapperFunction<SPSString(SPSExecutorAddr, int32_t, bool)>::handle(
-      Session, CallCtx, Return, ArgBytes,
+      Session, CallId, Return, ArgBytes,
       WrapperFunction::handleWithSyncMethod(&Adder::addSync));
 }
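With the orc-rt side of the patch complete, any caller that previously smuggled context through void *CallCtx must now mint a uint64_t CallId. A sketch of driving one of the exported wrappers directly (onResult and the literal id are illustrative, not part of this patch; ArgBytes is assumed to already hold the SPS-serialized arguments):

// Sketch: invoking an exported sps_wrapper entry point under the new ABI.
// The caller chooses the CallId and receives it back, unchanged, in the
// Return callback, where it can match replies to outstanding calls.
static void onResult(orc_rt_SessionRef Session, uint64_t CallId,
                     orc_rt_WrapperFunctionBuffer ResultBytes) {
  // Look up the pending call for CallId and deserialize ResultBytes
  // (omitted in this sketch).
}

static void callReserve(orc_rt_SessionRef Session,
                        orc_rt_WrapperFunctionBuffer ArgBytes) {
  orc_rt_SimpleNativeMemoryMap_reserve_sps_wrapper(Session, /*CallId=*/1,
                                                   onResult, ArgBytes);
}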
diff --git a/revert_patches.txt b/revert_patches.txt
index b88b846f64b68..6be344f862546 100644
--- a/revert_patches.txt
+++ b/revert_patches.txt
@@ -8,3 +8,6 @@ breaks build of ROCmValidationSuite
 breaks fortran declare-target-link1
 [OMPIRBuilder] Fix addrspace of internal critical section lock (#166459
 ---
+complicated to land parallel-EQ
+Reland "[clang] Refactor option-related code from clangDriver into new clangOptions library" (#167374)
+---
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index 4d279bffd3723..4ee23d19abf12 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -1231,6 +1231,7 @@ cc_library(
         ":format",
         ":frontend",
         ":lex",
+        ":options",
         ":rewrite",
         ":support",
         ":tooling_core",
@@ -1507,6 +1508,19 @@ gentbl_cc_library(
     deps = ["//llvm:OptParserTdFiles"],
 )

+cc_library(
+    name = "options",
+    srcs = glob(["lib/Options/*.cpp"]),
+    hdrs = glob(["include/clang/Options/*.h"]),
+    includes = ["include"],
+    deps = [
+        ":basic",
+        ":driver_options_inc_gen",
+        ":static_analyzer_checkers_gen",
+        "//llvm:Option",
+    ],
+)
+
 cc_library(
     name = "driver",
     srcs = glob(
@@ -1544,6 +1558,7 @@ cc_library(
         ":config",
         ":driver_options_inc_gen",
         ":lex",
+        ":options",
         ":parse",
         ":static_analyzer_checkers_gen",
         "//llvm:BinaryFormat",
@@ -1700,6 +1715,7 @@ cc_library(
         ":driver_options_inc_gen",
         ":edit",
         ":lex",
+        ":options",
         ":parse",
         ":sema",
         ":serialization",
@@ -1769,6 +1785,7 @@ cc_library(
         ":frontend",
         ":frontend_tool",
         ":lex",
+        ":options",
         ":parse",
         ":sema",
         ":serialization",
@@ -2001,6 +2018,7 @@ cc_library(
         ":extract_api",
         ":frontend",
         ":frontend_rewrite",
+        ":options",
         ":static_analyzer_frontend",
         "//llvm:Option",
         "//llvm:Support",
@@ -2176,6 +2194,7 @@ cc_library(
         ":frontend_rewrite",
         ":frontend_tool",
         ":lex",
+        ":options",
         ":parse",
         ":sema",
         ":serialization",
@@ -2258,6 +2277,7 @@ cc_binary(
         ":driver",
         ":frontend",
         ":frontend_rewrite",
+        ":options",
         ":serialization",
         ":static_analyzer_frontend",
         ":tooling",