diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 9ac196ad0e821..04db160b64b05 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -163,8 +163,8 @@ if (BOLT_ENABLE_RUNTIME) add_llvm_install_targets(install-bolt_rt DEPENDS bolt_rt bolt COMPONENT bolt) - set(LIBBOLT_RT_INSTR "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib/libbolt_rt_instr.a") - set(LIBBOLT_RT_HUGIFY "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib/libbolt_rt_hugify.a") + set(LIBBOLT_RT_INSTR "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_instr.a") + set(LIBBOLT_RT_HUGIFY "${CMAKE_CURRENT_BINARY_DIR}/bolt_rt-bins/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_hugify.a") endif() find_program(GNU_LD_EXECUTABLE NAMES ${LLVM_DEFAULT_TARGET_TRIPLE}-ld.bfd ld.bfd DOC "GNU ld") diff --git a/bolt/lib/Passes/Inliner.cpp b/bolt/lib/Passes/Inliner.cpp index f004a8eeea185..1793f4ff1f148 100644 --- a/bolt/lib/Passes/Inliner.cpp +++ b/bolt/lib/Passes/Inliner.cpp @@ -310,13 +310,13 @@ Inliner::inlineCall(BinaryBasicBlock &CallerBB, if (MIB.isPseudo(Inst)) continue; - MIB.stripAnnotations(Inst, /*KeepTC=*/BC.isX86()); + MIB.stripAnnotations(Inst, /*KeepTC=*/BC.isX86() || BC.isAArch64()); // Fix branch target. Strictly speaking, we don't have to do this as // targets of direct branches will be fixed later and don't matter // in the CFG state. However, disassembly may look misleading, and // hence we do the fixing. - if (MIB.isBranch(Inst)) { + if (MIB.isBranch(Inst) && !MIB.isTailCall(Inst)) { assert(!MIB.isIndirectBranch(Inst) && "unexpected indirect branch in callee"); const BinaryBasicBlock *TargetBB = diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index d752751c17932..d84da10b5bbe6 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -133,6 +133,36 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { public: using MCPlusBuilder::MCPlusBuilder; + MCPhysReg getStackPointer() const override { return AArch64::SP; } + + bool isPush(const MCInst &Inst) const override { return false; } + + bool isPop(const MCInst &Inst) const override { return false; } + + void createCall(MCInst &Inst, const MCSymbol *Target, + MCContext *Ctx) override { + createDirectCall(Inst, Target, Ctx, false); + } + + bool convertTailCallToCall(MCInst &Inst) override { + int NewOpcode; + switch (Inst.getOpcode()) { + default: + return false; + case AArch64::B: + NewOpcode = AArch64::BL; + break; + case AArch64::BR: + NewOpcode = AArch64::BLR; + break; + } + + Inst.setOpcode(NewOpcode); + removeAnnotation(Inst, MCPlus::MCAnnotation::kTailCall); + clearOffset(Inst); + return true; + } + bool equals(const MCTargetExpr &A, const MCTargetExpr &B, CompFuncTy Comp) const override { const auto &AArch64ExprA = cast(A); diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt index 40f4fbc9f30d5..0deb69a27d435 100644 --- a/bolt/runtime/CMakeLists.txt +++ b/bolt/runtime/CMakeLists.txt @@ -16,18 +16,18 @@ add_library(bolt_rt_instr STATIC instr.cpp ${CMAKE_CURRENT_BINARY_DIR}/config.h ) -set_target_properties(bolt_rt_instr PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") +set_target_properties(bolt_rt_instr PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "lib${LLVM_LIBDIR_SUFFIX}") add_library(bolt_rt_hugify STATIC hugify.cpp ${CMAKE_CURRENT_BINARY_DIR}/config.h ) -set_target_properties(bolt_rt_hugify PROPERTIES ARCHIVE_OUTPUT_DIRECTORY 
"${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") +set_target_properties(bolt_rt_hugify PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "lib${LLVM_LIBDIR_SUFFIX}") if(NOT BOLT_BUILT_STANDALONE) add_custom_command(TARGET bolt_rt_instr POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_instr.a" "${LLVM_LIBRARY_DIR}") + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_instr.a" "${LLVM_LIBRARY_DIR}") add_custom_command(TARGET bolt_rt_hugify POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_hugify.a" "${LLVM_LIBRARY_DIR}") + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_hugify.a" "${LLVM_LIBRARY_DIR}") endif() set(BOLT_RT_FLAGS @@ -53,23 +53,23 @@ target_include_directories(bolt_rt_instr PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) target_compile_options(bolt_rt_hugify PRIVATE ${BOLT_RT_FLAGS}) target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) -install(TARGETS bolt_rt_instr DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") -install(TARGETS bolt_rt_hugify DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") +install(TARGETS bolt_rt_instr DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") +install(TARGETS bolt_rt_hugify DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Darwin") add_library(bolt_rt_instr_osx STATIC instr.cpp ${CMAKE_CURRENT_BINARY_DIR}/config.h ) - set_target_properties(bolt_rt_instr_osx PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") + set_target_properties(bolt_rt_instr_osx PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "lib${LLVM_LIBDIR_SUFFIX}") target_include_directories(bolt_rt_instr_osx PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) target_compile_options(bolt_rt_instr_osx PRIVATE -target x86_64-apple-darwin19.6.0 ${BOLT_RT_FLAGS}) - install(TARGETS bolt_rt_instr_osx DESTINATION "${CMAKE_INSTALL_LIBDIR}${LLVM_LIBDIR_SUFFIX}") + install(TARGETS bolt_rt_instr_osx DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") if(NOT BOLT_BUILT_STANDALONE) add_custom_command(TARGET bolt_rt_instr_osx POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib/libbolt_rt_instr_osx.a" "${LLVM_LIBRARY_DIR}") + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/libbolt_rt_instr_osx.a" "${LLVM_LIBRARY_DIR}") endif() endif() diff --git a/bolt/test/AArch64/inline-small-function-1.s b/bolt/test/AArch64/inline-small-function-1.s new file mode 100644 index 0000000000000..3ea22a9915fb4 --- /dev/null +++ b/bolt/test/AArch64/inline-small-function-1.s @@ -0,0 +1,42 @@ +## This test checks that inline is properly handled by BOLT on aarch64. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags -O0 %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt --inline-small-functions --print-inline --print-only=_Z3barP1A \ +# RUN: %t.exe -o %t.bolt | FileCheck %s + +# CHECK: BOLT-INFO: inlined 0 calls at 1 call sites in 2 iteration(s). Change in binary size: 4 bytes. +# CHECK: Binary Function "_Z3barP1A" after inlining { +# CHECK-NOT: bl _Z3fooP1A +# CHECK: ldr x8, [x0] +# CHECK-NEXT: ldr w0, [x8] + + .text + .globl _Z3fooP1A + .type _Z3fooP1A,@function +_Z3fooP1A: + ldr x8, [x0] + ldr w0, [x8] + ret + .size _Z3fooP1A, .-_Z3fooP1A + + .globl _Z3barP1A + .type _Z3barP1A,@function +_Z3barP1A: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + bl _Z3fooP1A + mul w0, w0, w0 + ldp x29, x30, [sp], #16 + ret + .size _Z3barP1A, .-_Z3barP1A + + .globl main + .p2align 2 + .type main,@function +main: + mov w0, wzr + ret + .size main, .-main diff --git a/bolt/test/AArch64/inline-small-function-2.s b/bolt/test/AArch64/inline-small-function-2.s new file mode 100644 index 0000000000000..5eb7d391fd157 --- /dev/null +++ b/bolt/test/AArch64/inline-small-function-2.s @@ -0,0 +1,48 @@ +## This test checks that inline is properly handled by BOLT on aarch64. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags -O0 %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt --inline-small-functions --print-inline --print-only=test \ +# RUN: %t.exe -o %t.bolt | FileCheck %s + +#CHECK: BOLT-INFO: inlined 0 calls at 1 call sites in 2 iteration(s). Change in binary size: 4 bytes. +#CHECK: Binary Function "test" after inlining { +#CHECK-NOT: bl indirect +#CHECK: add w0, w1, w0 +#CHECK-NEXT: blr x2 + + .text + .globl indirect + .type indirect,@function +indirect: + add w0, w1, w0 + br x2 + .size indirect, .-indirect + + .globl test + .type test,@function +test: + stp x29, x30, [sp, #-32]! + stp x20, x19, [sp, #16] + mov x29, sp + mov w19, w1 + mov w20, w0 + bl indirect + add w8, w19, w20 + cmp w0, #0 + csinc w0, w8, wzr, eq + ldp x20, x19, [sp, #16] + ldp x29, x30, [sp], #32 + ret + .size test, .-test + + .globl main + .type main,@function +main: + mov w0, wzr + ret + .size main, .-main + + \ No newline at end of file diff --git a/bolt/test/lit.local.cfg b/bolt/test/lit.local.cfg index e2fa0a4a2210f..d5a6849b27a77 100644 --- a/bolt/test/lit.local.cfg +++ b/bolt/test/lit.local.cfg @@ -1,5 +1,5 @@ host_linux_triple = config.target_triple.split("-")[0] + "-unknown-linux-gnu" -common_linker_flags = "-fuse-ld=lld -Wl,--unresolved-symbols=ignore-all -pie" +common_linker_flags = "-fuse-ld=lld -Wl,--unresolved-symbols=ignore-all -Wl,--build-id=none -pie" flags = f"--target={host_linux_triple} -fPIE {common_linker_flags}" config.substitutions.insert(0, ("%cflags", f"%cflags {flags}")) diff --git a/clang/include/clang/AST/Attr.h b/clang/include/clang/AST/Attr.h index 3365ebe4d9012..bed532a84a1bd 100644 --- a/clang/include/clang/AST/Attr.h +++ b/clang/include/clang/AST/Attr.h @@ -60,6 +60,8 @@ class Attr : public AttributeCommonInfo { unsigned IsLateParsed : 1; LLVM_PREFERRED_TYPE(bool) unsigned InheritEvenIfAlreadyPresent : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned DeferDeserialization : 1; void *operator new(size_t bytes) noexcept { llvm_unreachable("Attrs cannot be allocated with regular 'new'."); @@ -80,10 +82,11 @@ class Attr : public AttributeCommonInfo { protected: Attr(ASTContext &Context, const AttributeCommonInfo &CommonInfo, - attr::Kind AK, bool IsLateParsed) + attr::Kind AK, bool IsLateParsed, bool DeferDeserialization = false) : AttributeCommonInfo(CommonInfo), AttrKind(AK), Inherited(false), IsPackExpansion(false), Implicit(false), IsLateParsed(IsLateParsed), - InheritEvenIfAlreadyPresent(false) {} + InheritEvenIfAlreadyPresent(false), + DeferDeserialization(DeferDeserialization) {} public: attr::Kind getKind() const { return static_cast(AttrKind); } @@ -105,6 +108,8 @@ class Attr : public AttributeCommonInfo { void setPackExpansion(bool PE) { IsPackExpansion = PE; } bool isPackExpansion() const { return IsPackExpansion; } + bool shouldDeferDeserialization() const { return DeferDeserialization; } + // Clone this attribute. 
Attr *clone(ASTContext &C) const; @@ -146,8 +151,9 @@ class InheritableAttr : public Attr { protected: InheritableAttr(ASTContext &Context, const AttributeCommonInfo &CommonInfo, attr::Kind AK, bool IsLateParsed, - bool InheritEvenIfAlreadyPresent) - : Attr(Context, CommonInfo, AK, IsLateParsed) { + bool InheritEvenIfAlreadyPresent, + bool DeferDeserialization = false) + : Attr(Context, CommonInfo, AK, IsLateParsed, DeferDeserialization) { this->InheritEvenIfAlreadyPresent = InheritEvenIfAlreadyPresent; } diff --git a/clang/include/clang/Basic/AllDiagnosticKinds.inc b/clang/include/clang/Basic/AllDiagnosticKinds.inc new file mode 100644 index 0000000000000..a946b4a640ac6 --- /dev/null +++ b/clang/include/clang/Basic/AllDiagnosticKinds.inc @@ -0,0 +1,33 @@ +//===--- AllDiagnosticKinds.inc----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Defines the Diagnostic IDs in ID sorted order. The order is dictated by +/// the enum in DiagnosticIDs.h#L49-L65. +/// +//===----------------------------------------------------------------------===// + +// Turn off clang-format, as the order of the includes are important to make +// sure tables based on Diagnostic IDs are partitioned/sorted based on +// DiagID. + +// clang-format off +#include "clang/Basic/DiagnosticCommonKinds.inc" +#include "clang/Basic/DiagnosticDriverKinds.inc" +#include "clang/Basic/DiagnosticFrontendKinds.inc" +#include "clang/Basic/DiagnosticSerializationKinds.inc" +#include "clang/Basic/DiagnosticLexKinds.inc" +#include "clang/Basic/DiagnosticParseKinds.inc" +#include "clang/Basic/DiagnosticASTKinds.inc" +#include "clang/Basic/DiagnosticCommentKinds.inc" +#include "clang/Basic/DiagnosticCrossTUKinds.inc" +#include "clang/Basic/DiagnosticSemaKinds.inc" +#include "clang/Basic/DiagnosticAnalysisKinds.inc" +#include "clang/Basic/DiagnosticRefactoringKinds.inc" +#include "clang/Basic/DiagnosticInstallAPIKinds.inc" +// clang-format on diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 408d3adf370c8..3969dd8af5dfa 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -713,6 +713,12 @@ class Attr { // attribute may be documented under multiple categories, more than one // Documentation entry may be listed. list Documentation; + // Set to true if deserialization of this attribute must be deferred until + // the parent Decl is fully deserialized (during header module file + // deserialization). E.g., this is the case for the preferred_name attribute, + // since its type deserialization depends on its target Decl type. + // (See https://github.com/llvm/llvm-project/issues/56490 for details). + bit DeferDeserialization = 0; } /// Used to define a set of mutually exclusive attributes. @@ -3254,6 +3260,11 @@ def PreferredName : InheritableAttr { let InheritEvenIfAlreadyPresent = 1; let MeaningfulToClassTemplateDefinition = 1; let TemplateDependent = 1; + // Type of this attribute depends on the target Decl type. + // Therefore, its deserialization must be deferred until + // deserialization of the target Decl is complete + // (for header modules). 
+ let DeferDeserialization = 1; } def PreserveMost : DeclOrTypeAttr { diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index 40dae25f7b54b..d568d2fd7aa30 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -740,6 +740,8 @@ enum ASTRecordTypes { CXX_ADDED_TEMPLATE_PARTIAL_SPECIALIZATION = 75, UPDATE_MODULE_LOCAL_VISIBLE = 76, + + UPDATE_TU_LOCAL_VISIBLE = 77, }; /// Record types used within a source manager block. @@ -1340,6 +1342,10 @@ enum DeclCode { /// only visible from DeclContext in the same module. DECL_CONTEXT_MODULE_LOCAL_VISIBLE, + /// A record that stores the set of declarations that are only visible + /// to the TU. + DECL_CONTEXT_TU_LOCAL_VISIBLE, + /// A LabelDecl record. DECL_LABEL, diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index d77bb01c5aa59..82564fe664acb 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -528,6 +528,7 @@ class ASTReader uint64_t LexicalOffset; uint64_t VisibleOffset; uint64_t ModuleLocalOffset; + uint64_t TULocalOffset; }; using DelayedNamespaceOffsetMapTy = @@ -640,6 +641,9 @@ class ASTReader llvm::DenseMap ModuleLocalLookups; + llvm::DenseMap + TULocalLookups; using SpecLookupTableTy = llvm::DenseMap PendingVisibleUpdates; llvm::DenseMap PendingModuleLocalVisibleUpdates; + llvm::DenseMap TULocalUpdates; using SpecializationsUpdate = SmallVector; using SpecializationsUpdateMap = @@ -704,11 +709,17 @@ class ASTReader llvm::BitstreamCursor &Cursor, uint64_t Offset, DeclContext *DC); + enum class VisibleDeclContextStorageKind { + GenerallyVisible, + ModuleLocalVisible, + TULocalVisible, + }; + /// Read the record that describes the visible contents of a DC. bool ReadVisibleDeclContextStorage(ModuleFile &M, llvm::BitstreamCursor &Cursor, uint64_t Offset, GlobalDeclID ID, - bool IsModuleLocal); + VisibleDeclContextStorageKind VisibleKind); bool ReadSpecializations(ModuleFile &M, llvm::BitstreamCursor &Cursor, uint64_t Offset, Decl *D, bool IsPartial); @@ -1148,6 +1159,10 @@ class ASTReader unsigned NumModuleLocalVisibleDeclContexts = 0, TotalModuleLocalVisibleDeclContexts = 0; + /// Number of TU Local decl contexts read/total + unsigned NumTULocalVisibleDeclContexts = 0, + TotalTULocalVisibleDeclContexts = 0; + /// Total size of modules, in bits, currently loaded uint64_t TotalModulesSizeInBits = 0; @@ -1221,6 +1236,24 @@ class ASTReader /// been completed. std::deque PendingDeclContextInfos; + /// Deserialization of some attributes must be deferred since they refer + /// to themselves in their type (e.g., preferred_name attribute refers to the + /// typedef that refers back to the template specialization of the template + /// that the attribute is attached to). + /// More attributes that store TypeSourceInfo might be potentially affected, + /// see https://github.com/llvm/llvm-project/issues/56490 for details. + struct DeferredAttribute { + // Index of the deferred attribute in the Record of the TargetedDecl. + uint64_t RecordIdx; + // Decl to attach a deferred attribute to. + Decl *TargetedDecl; + }; + + /// The collection of Decls that have been loaded but some of their attributes + /// have been deferred, paired with the index inside the record pointing + /// at the skipped attribute. 
+ SmallVector PendingDeferredAttributes; + template using DuplicateObjCDecls = std::pair; @@ -1463,6 +1496,9 @@ class ASTReader const serialization::reader::ModuleLocalLookupTable * getModuleLocalLookupTables(DeclContext *Primary) const; + const serialization::reader::DeclContextLookupTable * + getTULocalLookupTables(DeclContext *Primary) const; + /// Get the loaded specializations lookup tables for \p D, /// if any. serialization::reader::LazySpecializationInfoLookupTable * @@ -1570,6 +1606,7 @@ class ASTReader void loadPendingDeclChain(Decl *D, uint64_t LocalOffset); void loadObjCCategories(GlobalDeclID ID, ObjCInterfaceDecl *D, unsigned PreviousGeneration = 0); + void loadDeferredAttribute(const DeferredAttribute &DA); RecordLocation getLocalBitOffset(uint64_t GlobalOffset); uint64_t getGlobalBitOffset(ModuleFile &M, uint64_t LocalOffset); diff --git a/clang/include/clang/Serialization/ASTRecordReader.h b/clang/include/clang/Serialization/ASTRecordReader.h index 2561418b78ca7..a29972fcf73a8 100644 --- a/clang/include/clang/Serialization/ASTRecordReader.h +++ b/clang/include/clang/Serialization/ASTRecordReader.h @@ -83,6 +83,12 @@ class ASTRecordReader /// Returns the current value in this record, without advancing. uint64_t peekInt() { return Record[Idx]; } + /// Returns the next N values in this record, without advancing. + uint64_t peekInts(unsigned N) { return Record[Idx + N]; } + + /// Skips the current value. + void skipInt() { Idx += 1; } + /// Skips the specified number of values. void skipInts(unsigned N) { Idx += N; } @@ -335,7 +341,12 @@ class ASTRecordReader Attr *readAttr(); /// Reads attributes from the current stream position, advancing Idx. - void readAttributes(AttrVec &Attrs); + /// For some attributes (where type depends on itself recursively), defer + /// reading the attribute until the type has been read. + void readAttributes(AttrVec &Attrs, Decl *D = nullptr); + + /// Reads one attribute from the current stream position, advancing Idx. + Attr *readOrDeferAttrFor(Decl *D); /// Read an BTFTypeTagAttr object. BTFTypeTagAttr *readBTFTypeTagAttr() { diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h index 53b09cc914392..079e39a9fb678 100644 --- a/clang/include/clang/Serialization/ASTWriter.h +++ b/clang/include/clang/Serialization/ASTWriter.h @@ -496,6 +496,9 @@ class ASTWriter : public ASTDeserializationListener, /// file. unsigned NumModuleLocalDeclContexts = 0; + /// The number of TULocal declcontexts written to the AST file. + unsigned NumTULocalDeclContexts = 0; + /// A mapping from each known submodule to its ID number, which will /// be a positive integer. 
llvm::DenseMap SubmoduleIDs; @@ -594,12 +597,14 @@ class ASTWriter : public ASTDeserializationListener, void GenerateNameLookupTable(ASTContext &Context, const DeclContext *DC, llvm::SmallVectorImpl &LookupTable, - llvm::SmallVectorImpl &ModuleLocalLookupTable); + llvm::SmallVectorImpl &ModuleLocalLookupTable, + llvm::SmallVectorImpl &TULocalLookupTable); uint64_t WriteDeclContextLexicalBlock(ASTContext &Context, const DeclContext *DC); void WriteDeclContextVisibleBlock(ASTContext &Context, DeclContext *DC, uint64_t &VisibleBlockOffset, - uint64_t &ModuleLocalBlockOffset); + uint64_t &ModuleLocalBlockOffset, + uint64_t &TULocalBlockOffset); void WriteTypeDeclOffsets(); void WriteFileDeclIDsMap(); void WriteComments(ASTContext &Context); @@ -633,8 +638,10 @@ class ASTWriter : public ASTDeserializationListener, unsigned DeclContextLexicalAbbrev = 0; unsigned DeclContextVisibleLookupAbbrev = 0; unsigned DeclModuleLocalVisibleLookupAbbrev = 0; + unsigned DeclTULocalLookupAbbrev = 0; unsigned UpdateVisibleAbbrev = 0; unsigned ModuleLocalUpdateVisibleAbbrev = 0; + unsigned TULocalUpdateVisibleAbbrev = 0; unsigned DeclRecordAbbrev = 0; unsigned DeclTypedefAbbrev = 0; unsigned DeclVarAbbrev = 0; diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 4bfb80589620c..6677119d09211 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -4247,6 +4247,7 @@ bool Compiler::visitExpr(const Expr *E, bool DestroyToplevelScope) { // For us, that means everything we don't // have a PrimType for. if (std::optional LocalOffset = this->allocateLocal(E)) { + InitLinkScope ILS(this, InitLink::Temp(*LocalOffset)); if (!this->emitGetPtrLocal(*LocalOffset, E)) return false; diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp index d77f28c80b2eb..81194bbf2538e 100644 --- a/clang/lib/Basic/DiagnosticIDs.cpp +++ b/clang/lib/Basic/DiagnosticIDs.cpp @@ -37,21 +37,7 @@ struct StaticDiagInfoDescriptionStringTable { #define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ char ENUM##_desc[sizeof(DESC)]; - // clang-format off -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include "clang/Basic/DiagnosticDriverKinds.inc" -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#include "clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include "clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" - // clang-format on +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; @@ -59,21 +45,7 @@ const StaticDiagInfoDescriptionStringTable StaticDiagInfoDescriptions = { #define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ DESC, -// clang-format off -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include "clang/Basic/DiagnosticDriverKinds.inc" -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#include "clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include 
"clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" -// clang-format on +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; @@ -85,21 +57,7 @@ const uint32_t StaticDiagInfoDescriptionOffsets[] = { #define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ offsetof(StaticDiagInfoDescriptionStringTable, ENUM##_desc), -// clang-format off -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include "clang/Basic/DiagnosticDriverKinds.inc" -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#include "clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include "clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" -// clang-format on +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; diff --git a/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp b/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp index 5447b98d7105e..02635ce235a12 100644 --- a/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp +++ b/clang/lib/CodeGen/ObjectFilePCHContainerWriter.cpp @@ -81,6 +81,9 @@ class PCHContainerGenerator : public ASTConsumer { if (!TD->isCompleteDefinition()) return true; + if (D->hasAttr()) + return true; + QualType QualTy = Ctx.getTypeDeclType(D); if (!QualTy.isNull() && CanRepresent(QualTy.getTypePtr())) DI.getOrCreateStandaloneType(QualTy, D->getLocation()); diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 202227b195585..a72ff766685bb 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -1425,10 +1425,9 @@ bool ASTReader::ReadLexicalDeclContextStorage(ModuleFile &M, return false; } -bool ASTReader::ReadVisibleDeclContextStorage(ModuleFile &M, - BitstreamCursor &Cursor, - uint64_t Offset, GlobalDeclID ID, - bool IsModuleLocal) { +bool ASTReader::ReadVisibleDeclContextStorage( + ModuleFile &M, BitstreamCursor &Cursor, uint64_t Offset, GlobalDeclID ID, + ASTReader::VisibleDeclContextStorageKind VisibleKind) { assert(Offset != 0); SavedStreamPosition SavedPosition(Cursor); @@ -1452,22 +1451,42 @@ bool ASTReader::ReadVisibleDeclContextStorage(ModuleFile &M, return true; } unsigned RecCode = MaybeRecCode.get(); - if (!IsModuleLocal && RecCode != DECL_CONTEXT_VISIBLE) { - Error("Expected visible lookup table block"); - return true; - } - if (IsModuleLocal && RecCode != DECL_CONTEXT_MODULE_LOCAL_VISIBLE) { - Error("Expected module local visible lookup table block"); - return true; + switch (VisibleKind) { + case VisibleDeclContextStorageKind::GenerallyVisible: + if (RecCode != DECL_CONTEXT_VISIBLE) { + Error("Expected visible lookup table block"); + return true; + } + break; + case VisibleDeclContextStorageKind::ModuleLocalVisible: + if (RecCode != DECL_CONTEXT_MODULE_LOCAL_VISIBLE) { + Error("Expected module local visible lookup table block"); + return 
true; + } + break; + case VisibleDeclContextStorageKind::TULocalVisible: + if (RecCode != DECL_CONTEXT_TU_LOCAL_VISIBLE) { + Error("Expected TU local lookup table block"); + return true; + } + break; } // We can't safely determine the primary context yet, so delay attaching the // lookup table until we're done with recursive deserialization. auto *Data = (const unsigned char*)Blob.data(); - if (!IsModuleLocal) + switch (VisibleKind) { + case VisibleDeclContextStorageKind::GenerallyVisible: PendingVisibleUpdates[ID].push_back(UpdateData{&M, Data}); - else + break; + case VisibleDeclContextStorageKind::ModuleLocalVisible: PendingModuleLocalVisibleUpdates[ID].push_back(UpdateData{&M, Data}); + break; + case VisibleDeclContextStorageKind::TULocalVisible: + if (M.Kind == MK_MainFile) + TULocalUpdates[ID].push_back(UpdateData{&M, Data}); + break; + } return false; } @@ -3613,6 +3632,21 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, break; } + case UPDATE_TU_LOCAL_VISIBLE: { + if (F.Kind != MK_MainFile) + break; + unsigned Idx = 0; + GlobalDeclID ID = ReadDeclID(F, Record, Idx); + auto *Data = (const unsigned char *)Blob.data(); + TULocalUpdates[ID].push_back(UpdateData{&F, Data}); + // If we've already loaded the decl, perform the updates when we finish + // loading this block. + if (Decl *D = GetExistingDecl(ID)) + PendingUpdateRecords.push_back( + PendingUpdateRecord(ID, D, /*JustLoaded=*/false)); + break; + } + case CXX_ADDED_TEMPLATE_SPECIALIZATION: { unsigned Idx = 0; GlobalDeclID ID = ReadDeclID(F, Record, Idx); @@ -3717,6 +3751,7 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, TotalLexicalDeclContexts += Record[2]; TotalVisibleDeclContexts += Record[3]; TotalModuleLocalVisibleDeclContexts += Record[4]; + TotalTULocalVisibleDeclContexts += Record[5]; break; case UNUSED_FILESCOPED_DECLS: @@ -4002,7 +4037,7 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, break; case DELAYED_NAMESPACE_LEXICAL_VISIBLE_RECORD: { - if (Record.size() % 4 != 0) + if (Record.size() % 5 != 0) return llvm::createStringError( std::errc::illegal_byte_sequence, "invalid DELAYED_NAMESPACE_LEXICAL_VISIBLE_RECORD block in AST " @@ -4021,9 +4056,12 @@ llvm::Error ASTReader::ReadASTBlock(ModuleFile &F, uint64_t LocalModuleLocalOffset = Record[I++]; uint64_t ModuleLocalOffset = LocalModuleLocalOffset ? BaseOffset + LocalModuleLocalOffset : 0; + uint64_t TULocalLocalOffset = Record[I++]; + uint64_t TULocalOffset = + TULocalLocalOffset ? 
BaseOffset + TULocalLocalOffset : 0; DelayedNamespaceOffsetMap[ID] = {LexicalOffset, VisibleOffset, - ModuleLocalOffset}; + ModuleLocalOffset, TULocalOffset}; assert(!GetExistingDecl(ID) && "We shouldn't load the namespace in the front of delayed " @@ -8473,6 +8511,15 @@ bool ASTReader::FindExternalVisibleDeclsByName(const DeclContext *DC, } } + if (auto It = TULocalLookups.find(DC); It != TULocalLookups.end()) { + ++NumTULocalVisibleDeclContexts; + for (GlobalDeclID ID : It->second.Table.find(Name)) { + NamedDecl *ND = cast(GetDecl(ID)); + if (ND->getDeclName() == Name && Found.insert(ND).second) + Decls.push_back(ND); + } + } + SetExternalVisibleDeclsForName(DC, Name, Decls); return !Decls.empty(); } @@ -8500,6 +8547,7 @@ void ASTReader::completeVisibleDeclsMap(const DeclContext *DC) { findAll(Lookups, NumVisibleDeclContextsRead); findAll(ModuleLocalLookups, NumModuleLocalVisibleDeclContexts); + findAll(TULocalLookups, NumTULocalVisibleDeclContexts); for (DeclsMap::iterator I = Decls.begin(), E = Decls.end(); I != E; ++I) { SetExternalVisibleDeclsForName(DC, I->first, I->second); @@ -8519,6 +8567,12 @@ ASTReader::getModuleLocalLookupTables(DeclContext *Primary) const { return I == ModuleLocalLookups.end() ? nullptr : &I->second; } +const serialization::reader::DeclContextLookupTable * +ASTReader::getTULocalLookupTables(DeclContext *Primary) const { + auto I = TULocalLookups.find(Primary); + return I == TULocalLookups.end() ? nullptr : &I->second; +} + serialization::reader::LazySpecializationInfoLookupTable * ASTReader::getLoadedSpecializationsLookupTables(const Decl *D, bool IsPartial) { assert(D->isCanonicalDecl()); @@ -8634,6 +8688,11 @@ void ASTReader::PrintStats() { NumModuleLocalVisibleDeclContexts, TotalModuleLocalVisibleDeclContexts, ((float)NumModuleLocalVisibleDeclContexts / TotalModuleLocalVisibleDeclContexts * 100)); + if (TotalTULocalVisibleDeclContexts) + std::fprintf(stderr, " %u/%u visible declcontexts in GMF read (%f%%)\n", + NumTULocalVisibleDeclContexts, TotalTULocalVisibleDeclContexts, + ((float)NumTULocalVisibleDeclContexts / + TotalTULocalVisibleDeclContexts * 100)); if (TotalNumMethodPoolEntries) std::fprintf(stderr, " %u/%u method pool entries read (%f%%)\n", NumMethodPoolEntriesRead, TotalNumMethodPoolEntries, @@ -10180,6 +10239,11 @@ void ASTReader::finishPendingActions() { } PendingDeducedVarTypes.clear(); + // Load the delayed preferred name attributes. + for (unsigned I = 0; I != PendingDeferredAttributes.size(); ++I) + loadDeferredAttribute(PendingDeferredAttributes[I]); + PendingDeferredAttributes.clear(); + // For each decl chain that we wanted to complete while deserializing, mark // it as "still needs to be completed". 
for (unsigned I = 0; I != PendingIncompleteDeclChains.size(); ++I) { diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 1c51a7b5e460f..de834285fa76b 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -414,7 +414,8 @@ class ASTDeclReader : public DeclVisitor { void VisitLifetimeExtendedTemporaryDecl(LifetimeExtendedTemporaryDecl *D); void VisitDeclContext(DeclContext *DC, uint64_t &LexicalOffset, - uint64_t &VisibleOffset, uint64_t &ModuleLocalOffset); + uint64_t &VisibleOffset, uint64_t &ModuleLocalOffset, + uint64_t &TULocalOffset); template RedeclarableResult VisitRedeclarable(Redeclarable *D); @@ -612,7 +613,7 @@ void ASTDeclReader::VisitDecl(Decl *D) { if (HasAttrs) { AttrVec Attrs; - Record.readAttributes(Attrs); + Record.readAttributes(Attrs, D); // Avoid calling setAttrs() directly because it uses Decl::getASTContext() // internally which is unsafe during derialization. D->setAttrsImpl(Attrs, Reader.getContext()); @@ -1859,7 +1860,9 @@ void ASTDeclReader::VisitHLSLBufferDecl(HLSLBufferDecl *D) { uint64_t LexicalOffset = 0; uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; - VisitDeclContext(D, LexicalOffset, VisibleOffset, ModuleLocalOffset); + uint64_t TULocalOffset = 0; + VisitDeclContext(D, LexicalOffset, VisibleOffset, ModuleLocalOffset, + TULocalOffset); D->IsCBuffer = Record.readBool(); D->KwLoc = readSourceLocation(); D->LBraceLoc = readSourceLocation(); @@ -2770,10 +2773,12 @@ void ASTDeclReader::VisitLifetimeExtendedTemporaryDecl( void ASTDeclReader::VisitDeclContext(DeclContext *DC, uint64_t &LexicalOffset, uint64_t &VisibleOffset, - uint64_t &ModuleLocalOffset) { + uint64_t &ModuleLocalOffset, + uint64_t &TULocalOffset) { LexicalOffset = ReadLocalOffset(); VisibleOffset = ReadLocalOffset(); ModuleLocalOffset = ReadLocalOffset(); + TULocalOffset = ReadLocalOffset(); } template @@ -3093,6 +3098,8 @@ class AttrReader { return Reader.readInt(); } + uint64_t peekInts(unsigned N) { return Reader.peekInts(N); } + bool readBool() { return Reader.readBool(); } SourceRange readSourceRange() { @@ -3123,18 +3130,29 @@ class AttrReader { return Reader.readVersionTuple(); } + void skipInt() { Reader.skipInts(1); } + + void skipInts(unsigned N) { Reader.skipInts(N); } + + unsigned getCurrentIdx() { return Reader.getIdx(); } + OMPTraitInfo *readOMPTraitInfo() { return Reader.readOMPTraitInfo(); } template T *readDeclAs() { return Reader.readDeclAs(); } }; } +/// Reads one attribute from the current stream position, advancing Idx. Attr *ASTRecordReader::readAttr() { AttrReader Record(*this); auto V = Record.readInt(); if (!V) return nullptr; + // Read and ignore the skip count, since attribute deserialization is not + // deferred on this pass. + Record.skipInt(); + Attr *New = nullptr; // Kind is stored as a 1-based integer because 0 is used to indicate a null // Attr pointer. @@ -3164,13 +3182,28 @@ Attr *ASTRecordReader::readAttr() { return New; } -/// Reads attributes from the current stream position. -void ASTRecordReader::readAttributes(AttrVec &Attrs) { +/// Reads attributes from the current stream position, advancing Idx. +/// For some attributes (where type depends on itself recursively), defer +/// reading the attribute until the type has been read. 
+void ASTRecordReader::readAttributes(AttrVec &Attrs, Decl *D) { for (unsigned I = 0, E = readInt(); I != E; ++I) - if (auto *A = readAttr()) + if (auto *A = readOrDeferAttrFor(D)) Attrs.push_back(A); } +/// Reads one attribute from the current stream position, advancing Idx. +/// For some attributes (where type depends on itself recursively), defer +/// reading the attribute until the type has been read. +Attr *ASTRecordReader::readOrDeferAttrFor(Decl *D) { + AttrReader Record(*this); + unsigned SkipCount = Record.peekInts(1); + if (!SkipCount) + return readAttr(); + Reader->PendingDeferredAttributes.push_back({Record.getCurrentIdx(), D}); + Record.skipInts(SkipCount); + return nullptr; +} + //===----------------------------------------------------------------------===// // ASTReader Implementation //===----------------------------------------------------------------------===// @@ -3875,6 +3908,7 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { case DECL_CONTEXT_LEXICAL: case DECL_CONTEXT_VISIBLE: case DECL_CONTEXT_MODULE_LOCAL_VISIBLE: + case DECL_CONTEXT_TU_LOCAL_VISIBLE: case DECL_SPECIALIZATIONS: case DECL_PARTIAL_SPECIALIZATIONS: llvm_unreachable("Record cannot be de-serialized with readDeclRecord"); @@ -4185,9 +4219,10 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { uint64_t LexicalOffset = 0; uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; + uint64_t TULocalOffset = 0; - Reader.VisitDeclContext(DC, LexicalOffset, VisibleOffset, - ModuleLocalOffset); + Reader.VisitDeclContext(DC, LexicalOffset, VisibleOffset, ModuleLocalOffset, + TULocalOffset); // Get the lexical and visible block for the delayed namespace. // It is sufficient to judge if ID is in DelayedNamespaceOffsetMap. @@ -4199,18 +4234,24 @@ Decl *ASTReader::ReadDeclRecord(GlobalDeclID ID) { LexicalOffset = Iter->second.LexicalOffset; VisibleOffset = Iter->second.VisibleOffset; ModuleLocalOffset = Iter->second.ModuleLocalOffset; + TULocalOffset = Iter->second.TULocalOffset; } if (LexicalOffset && ReadLexicalDeclContextStorage(*Loc.F, DeclsCursor, LexicalOffset, DC)) return nullptr; - if (VisibleOffset && - ReadVisibleDeclContextStorage(*Loc.F, DeclsCursor, VisibleOffset, ID, - /*IsModuleLocal=*/false)) + if (VisibleOffset && ReadVisibleDeclContextStorage( + *Loc.F, DeclsCursor, VisibleOffset, ID, + VisibleDeclContextStorageKind::GenerallyVisible)) return nullptr; if (ModuleLocalOffset && - ReadVisibleDeclContextStorage(*Loc.F, DeclsCursor, ModuleLocalOffset, - ID, /*IsModuleLocal=*/true)) + ReadVisibleDeclContextStorage( + *Loc.F, DeclsCursor, ModuleLocalOffset, ID, + VisibleDeclContextStorageKind::ModuleLocalVisible)) + return nullptr; + if (TULocalOffset && ReadVisibleDeclContextStorage( + *Loc.F, DeclsCursor, TULocalOffset, ID, + VisibleDeclContextStorageKind::TULocalVisible)) return nullptr; } assert(Record.getIdx() == Record.size()); @@ -4376,6 +4417,18 @@ void ASTReader::loadDeclUpdateRecords(PendingUpdateRecord &Record) { DC->setHasExternalVisibleStorage(true); } + if (auto I = TULocalUpdates.find(ID); I != TULocalUpdates.end()) { + auto Updates = std::move(I->second); + TULocalUpdates.erase(I); + + auto *DC = cast(D)->getPrimaryContext(); + for (const auto &Update : Updates) + TULocalLookups[DC].Table.add( + Update.Mod, Update.Data, + reader::ASTDeclContextNameLookupTrait(*this, *Update.Mod)); + DC->setHasExternalVisibleStorage(true); + } + // Load any pending related decls. 
if (D->isCanonicalDecl()) { if (auto IT = RelatedDeclsMap.find(ID); IT != RelatedDeclsMap.end()) { @@ -4459,6 +4512,49 @@ void ASTReader::loadPendingDeclChain(Decl *FirstLocal, uint64_t LocalOffset) { ASTDeclReader::attachLatestDecl(CanonDecl, MostRecent); } +void ASTReader::loadDeferredAttribute(const DeferredAttribute &DA) { + Decl *D = DA.TargetedDecl; + ModuleFile *M = getOwningModuleFile(D); + + unsigned LocalDeclIndex = D->getGlobalID().getLocalDeclIndex(); + const DeclOffset &DOffs = M->DeclOffsets[LocalDeclIndex]; + RecordLocation Loc(M, DOffs.getBitOffset(M->DeclsBlockStartOffset)); + + llvm::BitstreamCursor &Cursor = Loc.F->DeclsCursor; + SavedStreamPosition SavedPosition(Cursor); + if (llvm::Error Err = Cursor.JumpToBit(Loc.Offset)) { + Error(std::move(Err)); + } + + Expected MaybeCode = Cursor.ReadCode(); + if (!MaybeCode) { + llvm::report_fatal_error( + Twine("ASTReader::loadPreferredNameAttribute failed reading code: ") + + toString(MaybeCode.takeError())); + } + unsigned Code = MaybeCode.get(); + + ASTRecordReader Record(*this, *Loc.F); + Expected MaybeRecCode = Record.readRecord(Cursor, Code); + if (!MaybeRecCode) { + llvm::report_fatal_error( + Twine( + "ASTReader::loadPreferredNameAttribute failed reading rec code: ") + + toString(MaybeCode.takeError())); + } + unsigned RecCode = MaybeRecCode.get(); + if (RecCode < DECL_TYPEDEF || RecCode > DECL_LAST) { + llvm::report_fatal_error( + Twine("ASTReader::loadPreferredNameAttribute failed reading rec code: " + "expected valid DeclCode") + + toString(MaybeCode.takeError())); + } + + Record.skipInts(DA.RecordIdx); + Attr *A = Record.readAttr(); + getContext().getDeclAttrs(D).push_back(A); +} + namespace { /// Given an ObjC interface, goes through the modules and links to the diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 55d3c2bb56f2c..c7c17e09a30e0 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -37,6 +37,7 @@ #include "clang/AST/Type.h" #include "clang/AST/TypeLoc.h" #include "clang/AST/TypeLocVisitor.h" +#include "clang/Basic/AttrKinds.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/DiagnosticOptions.h" #include "clang/Basic/FileEntry.h" @@ -4046,6 +4047,13 @@ class ASTDeclContextNameLookupTraitBase { : Writer(Writer) {} public: + data_type getData(const DeclIDsTy &LocalIDs) { + unsigned Start = DeclIDs.size(); + for (auto ID : LocalIDs) + DeclIDs.push_back(ID); + return std::make_pair(Start, DeclIDs.size()); + } + data_type ImportData(const reader::ASTDeclContextNameLookupTrait::data_type &FromReader) { unsigned Start = DeclIDs.size(); DeclIDs.insert( @@ -4138,23 +4146,16 @@ class ASTDeclContextNameLookupTraitBase { } }; -class ModuleLocalNameLookupTrait : public ASTDeclContextNameLookupTraitBase { +class ModuleLevelNameLookupTrait : public ASTDeclContextNameLookupTraitBase { public: using primary_module_hash_type = unsigned; using key_type = std::pair; using key_type_ref = key_type; - explicit ModuleLocalNameLookupTrait(ASTWriter &Writer) + explicit ModuleLevelNameLookupTrait(ASTWriter &Writer) : ASTDeclContextNameLookupTraitBase(Writer) {} - data_type getData(const DeclIDsTy &LocalIDs) { - unsigned Start = DeclIDs.size(); - for (auto ID : LocalIDs) - DeclIDs.push_back(ID); - return std::make_pair(Start, DeclIDs.size()); - } - static bool EqualKey(key_type_ref a, key_type_ref b) { return a == b; } hash_value_type ComputeHash(key_type Key) { @@ -4202,19 +4203,46 @@ static bool isModuleLocalDecl(NamedDecl *D) { 
return false; } +static bool isTULocalInNamedModules(NamedDecl *D) { + Module *NamedModule = D->getTopLevelOwningNamedModule(); + if (!NamedModule) + return false; + + // For non-top-level decls, we choose to move them to the general visible + // lookup table, since the consumer may get their parent somehow and perform + // a lookup in it (consider looking up the operator function in a lambda). + // The difference between the module local lookup table and the TU local + // lookup table is that consumers still have a chance to look up in the + // module local lookup table, but a consumer that is not the original TU + // won't read the TU local lookup table at all. + // + // FIXME: It seems to be an optimization opportunity (and also more correct + // semantics) to retain the TULocal lookup table and perform a lookup similar + // to the one in the module local lookup table, except that we only allow + // lookups from the same module unit. + if (!D->getNonTransparentDeclContext()->isFileContext()) + return false; + + return D->getLinkageInternal() == Linkage::Internal; +} + // Trait used for the on-disk hash table used in the method pool. +template class ASTDeclContextNameLookupTrait : public ASTDeclContextNameLookupTraitBase { public: - using ModuleLocalDeclsMapTy = - llvm::DenseMap; - -private: - ModuleLocalDeclsMapTy ModuleLocalDeclsMap; + using ModuleLevelDeclsMapTy = + llvm::DenseMap; -public: using key_type = DeclarationNameKey; using key_type_ref = key_type; + using TULocalDeclsMapTy = llvm::DenseMap; + +private: + ModuleLevelDeclsMapTy ModuleLocalDeclsMap; + TULocalDeclsMapTy TULocalDeclsMap; + +public: explicit ASTDeclContextNameLookupTrait(ASTWriter &Writer) : ASTDeclContextNameLookupTraitBase(Writer) {} @@ -4250,15 +4278,30 @@ class ASTDeclContextNameLookupTrait : public ASTDeclContextNameLookupTraitBase { } } + if constexpr (CollectingTULocalDecls) { + if (isTULocalInNamedModules(D)) { + auto Iter = TULocalDeclsMap.find(D->getDeclName()); + if (Iter == TULocalDeclsMap.end()) + TULocalDeclsMap.insert({D->getDeclName(), DeclIDsTy{ID}}); + else + Iter->second.push_back(ID); + continue; + } + } + DeclIDs.push_back(ID); } return std::make_pair(Start, DeclIDs.size()); } - const ModuleLocalDeclsMapTy &getModuleLocalDecls() { + using ASTDeclContextNameLookupTraitBase::getData; + + const ModuleLevelDeclsMapTy &getModuleLocalDecls() { return ModuleLocalDeclsMap; } + const TULocalDeclsMapTy &getTULocalDecls() { return TULocalDeclsMap; } + static bool EqualKey(key_type_ref a, key_type_ref b) { return a == b; } hash_value_type ComputeHash(key_type Name) { return Name.getHash(); } @@ -4486,7 +4529,8 @@ static bool isLookupResultNotInteresting(ASTWriter &Writer, void ASTWriter::GenerateNameLookupTable( ASTContext &Context, const DeclContext *ConstDC, llvm::SmallVectorImpl &LookupTable, - llvm::SmallVectorImpl &ModuleLocalLookupTable) { + llvm::SmallVectorImpl &ModuleLocalLookupTable, + llvm::SmallVectorImpl &TULookupTable) { assert(!ConstDC->hasLazyLocalLexicalLookups() && !ConstDC->hasLazyExternalLexicalLookups() && "must call buildLookups first"); @@ -4496,9 +4540,11 @@ void ASTWriter::GenerateNameLookupTable( assert(DC == DC->getPrimaryContext() && "only primary DC has lookup table"); // Create the on-disk hash table representation.
- MultiOnDiskHashTableGenerator Generator; - ASTDeclContextNameLookupTrait Trait(*this); + MultiOnDiskHashTableGenerator< + reader::ASTDeclContextNameLookupTrait, + ASTDeclContextNameLookupTrait> + Generator; + ASTDeclContextNameLookupTrait Trait(*this); // The first step is to collect the declaration names which we need to // serialize into the name lookup table, and to collect them in a stable @@ -4670,26 +4716,45 @@ void ASTWriter::GenerateNameLookupTable( Generator.emit(LookupTable, Trait, Lookups ? &Lookups->Table : nullptr); const auto &ModuleLocalDecls = Trait.getModuleLocalDecls(); - if (ModuleLocalDecls.empty()) - return; + if (!ModuleLocalDecls.empty()) { + MultiOnDiskHashTableGenerator + ModuleLocalLookupGenerator; + ModuleLevelNameLookupTrait ModuleLocalTrait(*this); + + for (const auto &ModuleLocalIter : ModuleLocalDecls) { + const auto &Key = ModuleLocalIter.first; + const auto &IDs = ModuleLocalIter.second; + ModuleLocalLookupGenerator.insert(Key, ModuleLocalTrait.getData(IDs), + ModuleLocalTrait); + } - MultiOnDiskHashTableGenerator - ModuleLocalLookupGenerator; - ModuleLocalNameLookupTrait ModuleLocalTrait(*this); + auto *ModuleLocalLookups = + Chain ? Chain->getModuleLocalLookupTables(DC) : nullptr; + ModuleLocalLookupGenerator.emit( + ModuleLocalLookupTable, ModuleLocalTrait, + ModuleLocalLookups ? &ModuleLocalLookups->Table : nullptr); + } + + const auto &TULocalDecls = Trait.getTULocalDecls(); + if (!TULocalDecls.empty() && !isGeneratingReducedBMI()) { + MultiOnDiskHashTableGenerator< + reader::ASTDeclContextNameLookupTrait, + ASTDeclContextNameLookupTrait> + TULookupGenerator; + ASTDeclContextNameLookupTrait TULocalTrait( + *this); + + for (const auto &TULocalIter : TULocalDecls) { + const auto &Key = TULocalIter.first; + const auto &IDs = TULocalIter.second; + TULookupGenerator.insert(Key, TULocalTrait.getData(IDs), TULocalTrait); + } - for (const auto &ModuleLocalIter : ModuleLocalDecls) { - const auto &Key = ModuleLocalIter.first; - const auto &IDs = ModuleLocalIter.second; - ModuleLocalLookupGenerator.insert(Key, ModuleLocalTrait.getData(IDs), - ModuleLocalTrait); + auto *TULocalLookups = Chain ? Chain->getTULocalLookupTables(DC) : nullptr; + TULookupGenerator.emit(TULookupTable, TULocalTrait, + TULocalLookups ? &TULocalLookups->Table : nullptr); } - - auto *ModuleLocalLookups = - Chain ? Chain->getModuleLocalLookupTables(DC) : nullptr; - ModuleLocalLookupGenerator.emit( - ModuleLocalLookupTable, ModuleLocalTrait, - ModuleLocalLookups ? &ModuleLocalLookups->Table : nullptr); } /// Write the block containing all of the declaration IDs @@ -4700,7 +4765,12 @@ void ASTWriter::GenerateNameLookupTable( void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context, DeclContext *DC, uint64_t &VisibleBlockOffset, - uint64_t &ModuleLocalBlockOffset) { + uint64_t &ModuleLocalBlockOffset, + uint64_t &TULocalBlockOffset) { + assert(VisibleBlockOffset == 0); + assert(ModuleLocalBlockOffset == 0); + assert(TULocalBlockOffset == 0); + // If we imported a key declaration of this namespace, write the visible // lookup results as an update record for it rather than including them // on this declaration. We will only look at key declarations on reload. @@ -4787,7 +4857,9 @@ void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context, // Create the on-disk hash table in a buffer. 
SmallString<4096> LookupTable; SmallString<4096> ModuleLocalLookupTable; - GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable); + SmallString<4096> TULookupTable; + GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable, + TULookupTable); // Write the lookup table RecordData::value_type Record[] = {DECL_CONTEXT_VISIBLE}; @@ -4795,17 +4867,26 @@ void ASTWriter::WriteDeclContextVisibleBlock(ASTContext &Context, LookupTable); ++NumVisibleDeclContexts; - if (ModuleLocalLookupTable.empty()) - return; + if (!ModuleLocalLookupTable.empty()) { + ModuleLocalBlockOffset = Stream.GetCurrentBitNo(); + assert(ModuleLocalBlockOffset > VisibleBlockOffset); + // Write the lookup table + RecordData::value_type ModuleLocalRecord[] = { + DECL_CONTEXT_MODULE_LOCAL_VISIBLE}; + Stream.EmitRecordWithBlob(DeclModuleLocalVisibleLookupAbbrev, + ModuleLocalRecord, ModuleLocalLookupTable); + ++NumModuleLocalDeclContexts; + } - ModuleLocalBlockOffset = Stream.GetCurrentBitNo(); - assert(ModuleLocalBlockOffset > VisibleBlockOffset); - // Write the lookup table - RecordData::value_type ModuleLocalRecord[] = { - DECL_CONTEXT_MODULE_LOCAL_VISIBLE}; - Stream.EmitRecordWithBlob(DeclModuleLocalVisibleLookupAbbrev, - ModuleLocalRecord, ModuleLocalLookupTable); - ++NumModuleLocalDeclContexts; + if (!TULookupTable.empty()) { + TULocalBlockOffset = Stream.GetCurrentBitNo(); + // Write the lookup table + RecordData::value_type TULocalDeclsRecord[] = { + DECL_CONTEXT_TU_LOCAL_VISIBLE}; + Stream.EmitRecordWithBlob(DeclTULocalLookupAbbrev, TULocalDeclsRecord, + TULookupTable); + ++NumTULocalDeclContexts; + } } /// Write an UPDATE_VISIBLE block for the given context. @@ -4823,7 +4904,9 @@ void ASTWriter::WriteDeclContextVisibleUpdate(ASTContext &Context, // Create the on-disk hash table in a buffer. SmallString<4096> LookupTable; SmallString<4096> ModuleLocalLookupTable; - GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable); + SmallString<4096> TULookupTable; + GenerateNameLookupTable(Context, DC, LookupTable, ModuleLocalLookupTable, + TULookupTable); // If we're updating a namespace, select a key declaration as the key for the // update record; those are the only ones that will be checked on reload. @@ -4835,14 +4918,20 @@ void ASTWriter::WriteDeclContextVisibleUpdate(ASTContext &Context, getDeclID(cast(DC)).getRawValue()}; Stream.EmitRecordWithBlob(UpdateVisibleAbbrev, Record, LookupTable); - if (ModuleLocalLookupTable.empty()) - return; + if (!ModuleLocalLookupTable.empty()) { + // Write the module local lookup table + RecordData::value_type ModuleLocalRecord[] = { + UPDATE_MODULE_LOCAL_VISIBLE, getDeclID(cast(DC)).getRawValue()}; + Stream.EmitRecordWithBlob(ModuleLocalUpdateVisibleAbbrev, ModuleLocalRecord, + ModuleLocalLookupTable); + } - // Write the module local lookup table - RecordData::value_type ModuleLocalRecord[] = { - UPDATE_MODULE_LOCAL_VISIBLE, getDeclID(cast(DC)).getRawValue()}; - Stream.EmitRecordWithBlob(ModuleLocalUpdateVisibleAbbrev, ModuleLocalRecord, - ModuleLocalLookupTable); + if (!TULookupTable.empty()) { + RecordData::value_type GMFRecord[] = { + UPDATE_TU_LOCAL_VISIBLE, getDeclID(cast(DC)).getRawValue()}; + Stream.EmitRecordWithBlob(TULocalUpdateVisibleAbbrev, GMFRecord, + TULookupTable); + } } /// Write an FP_PRAGMA_OPTIONS block for the given FPOptions. 
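For context, a minimal named-module sketch (illustrative only; the names mirror the basic.link/p2.cppm test updated later in this patch) of which lookup table each file-scope declaration now lands in, and why internal-linkage names stop being visible outside the interface unit itself:

// M.cppm -- module interface unit
export module M;
export int external_linkage_var = 0; // general visible lookup table
int module_linkage_var = 0;          // module-local table: found only from units of module M
static int internal_linkage_var = 0; // new TU-local table: not read by importers

// impl.cpp -- implementation unit of module M
module M;
int use_module() { return module_linkage_var; }     // OK: module-local lookup succeeds
int use_internal() { return internal_linkage_var; } // error: use of undeclared identifier

// user.cpp -- unrelated translation unit
import M;
int use_module_again() { return module_linkage_var; }     // error: use of undeclared identifier
int use_internal_again() { return internal_linkage_var; } // error: use of undeclared identifier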
@@ -5067,15 +5156,14 @@ void ASTWriter::WriteModuleFileExtension(Sema &SemaRef, void ASTRecordWriter::AddAttr(const Attr *A) { auto &Record = *this; - // FIXME: Clang can't handle the serialization/deserialization of - // preferred_name properly now. See - // https://github.com/llvm/llvm-project/issues/56490 for example. - if (!A || (isa(A) && - Writer->isWritingStdCXXNamedModules())) + if (!A) return Record.push_back(0); Record.push_back(A->getKind() + 1); // FIXME: stable encoding, target attrs + auto SkipIdx = Record.size(); + // Add placeholder for the size of deferred attribute. + Record.push_back(0); Record.AddIdentifierRef(A->getAttrName()); Record.AddIdentifierRef(A->getScopeName()); Record.AddSourceRange(A->getRange()); @@ -5086,6 +5174,12 @@ void ASTRecordWriter::AddAttr(const Attr *A) { Record.push_back(A->isRegularKeywordAttribute()); #include "clang/Serialization/AttrPCHWrite.inc" + + if (A->shouldDeferDeserialization()) { + // Record the actual size of deferred attribute (+ 1 to count the attribute + // kind). + Record[SkipIdx] = Record.size() - SkipIdx + 1; + } } /// Emit the list of attributes to the specified record. @@ -6025,9 +6119,12 @@ ASTFileSignature ASTWriter::WriteASTCore(Sema *SemaPtr, StringRef isysroot, } // Some simple statistics - RecordData::value_type Record[] = { - NumStatements, NumMacros, NumLexicalDeclContexts, NumVisibleDeclContexts, - NumModuleLocalDeclContexts}; + RecordData::value_type Record[] = {NumStatements, + NumMacros, + NumLexicalDeclContexts, + NumVisibleDeclContexts, + NumModuleLocalDeclContexts, + NumTULocalDeclContexts}; Stream.EmitRecord(STATISTICS, Record); Stream.ExitBlock(); Stream.FlushToWord(); @@ -6106,7 +6203,9 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { uint64_t LexicalOffset = WriteDeclContextLexicalBlock(Context, NS); uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; - WriteDeclContextVisibleBlock(Context, NS, VisibleOffset, ModuleLocalOffset); + uint64_t TULocalOffset = 0; + WriteDeclContextVisibleBlock(Context, NS, VisibleOffset, ModuleLocalOffset, + TULocalOffset); // Write the offset relative to current block. if (LexicalOffset) @@ -6118,10 +6217,14 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { if (ModuleLocalOffset) ModuleLocalOffset -= DeclTypesBlockStartOffset; + if (TULocalOffset) + TULocalOffset -= DeclTypesBlockStartOffset; + AddDeclRef(NS, DelayedNamespaceRecord); DelayedNamespaceRecord.push_back(LexicalOffset); DelayedNamespaceRecord.push_back(VisibleOffset); DelayedNamespaceRecord.push_back(ModuleLocalOffset); + DelayedNamespaceRecord.push_back(TULocalOffset); } // The process of writing lexical and visible block for delayed namespace @@ -6207,6 +6310,12 @@ void ASTWriter::WriteDeclAndTypes(ASTContext &Context) { Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Blob)); ModuleLocalUpdateVisibleAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); + Abv->Add(llvm::BitCodeAbbrevOp(UPDATE_TU_LOCAL_VISIBLE)); + Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::VBR, 6)); + Abv->Add(llvm::BitCodeAbbrevOp(llvm::BitCodeAbbrevOp::Blob)); + TULocalUpdateVisibleAbbrev = Stream.EmitAbbrev(std::move(Abv)); + // And a visible updates block for the translation unit. 
WriteDeclContextVisibleUpdate(Context, TU); diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 7a494cfe1ac64..30b28057f4c10 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -2069,6 +2069,7 @@ void ASTDeclWriter::VisitDeclContext(DeclContext *DC) { uint64_t LexicalOffset = 0; uint64_t VisibleOffset = 0; uint64_t ModuleLocalOffset = 0; + uint64_t TULocalOffset = 0; if (Writer.isGeneratingReducedBMI() && isa(DC) && cast(DC)->isFromExplicitGlobalModule()) { @@ -2080,12 +2081,14 @@ void ASTDeclWriter::VisitDeclContext(DeclContext *DC) { LexicalOffset = Writer.WriteDeclContextLexicalBlock(Record.getASTContext(), DC); Writer.WriteDeclContextVisibleBlock(Record.getASTContext(), DC, - VisibleOffset, ModuleLocalOffset); + VisibleOffset, ModuleLocalOffset, + TULocalOffset); } Record.AddOffset(LexicalOffset); Record.AddOffset(VisibleOffset); Record.AddOffset(ModuleLocalOffset); + Record.AddOffset(TULocalOffset); } const Decl *ASTWriter::getFirstLocalDecl(const Decl *D) { @@ -2441,6 +2444,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LexicalOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // VisibleOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ModuleLocalOffset + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TULocalOffset DeclEnumAbbrev = Stream.EmitAbbrev(std::move(Abv)); // Abbreviation for DECL_RECORD @@ -2494,6 +2498,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LexicalOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // VisibleOffset Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ModuleLocalOffset + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TULocalOffset DeclRecordAbbrev = Stream.EmitAbbrev(std::move(Abv)); // Abbreviation for DECL_PARM_VAR @@ -2836,6 +2841,11 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); DeclModuleLocalVisibleLookupAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); + Abv->Add(BitCodeAbbrevOp(serialization::DECL_CONTEXT_TU_LOCAL_VISIBLE)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); + DeclTULocalLookupAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared(); Abv->Add(BitCodeAbbrevOp(serialization::DECL_SPECIALIZATIONS)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); diff --git a/clang/test/AST/ByteCode/cxx20.cpp b/clang/test/AST/ByteCode/cxx20.cpp index 268362ceff635..268226a7c143e 100644 --- a/clang/test/AST/ByteCode/cxx20.cpp +++ b/clang/test/AST/ByteCode/cxx20.cpp @@ -893,3 +893,18 @@ namespace VirtDtor { static_assert(test('C', 'B')); } + +namespace TemporaryInNTTP { + template struct B { /* ... */ }; + struct J1 { + J1 *self=this; + }; + /// FIXME: The bytecode interpreter emits a different diagnostic here. + /// The current interpreter creates a fake MaterializeTemporaryExpr (see EvaluateAsConstantExpr) + /// which is later used as the LValueBase of the created APValue. + B j1; // ref-error {{pointer to temporary object is not allowed in a template argument}} \ + // expected-error {{non-type template argument is not a constant expression}} \ + // expected-note {{pointer to temporary is not a constant expression}} \ + // expected-note {{created here}} + B<2> j2; /// Ok. 
+} diff --git a/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp b/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp index a27946bd90a46..c200abafc0af8 100644 --- a/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp +++ b/clang/test/CXX/basic/basic.lookup/basic.lookup.argdep/p5-ex2.cpp @@ -61,6 +61,6 @@ void test() { // error: S::f is visible in instantiation context, but R::g has internal // linkage and cannot be used outside N.cpp - apply(x, S::Z()); // expected-error@N.cpp:10 {{no matching function for call to 'g'}} - // expected-note@-1 {{in instantiation of function template specialization 'apply' requested here}} + apply(x, S::Z()); // expected-error@N.cpp:10 {{use of undeclared identifier 'g'}} + // expected-note@-1 {{requested here}} } diff --git a/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp b/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp index 54ec6aa61ec37..d70eb7de22c6a 100644 --- a/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp +++ b/clang/test/CXX/basic/basic.scope/basic.scope.namespace/p2.cpp @@ -66,11 +66,7 @@ void test_late() { // expected-note@p2.cpp:18 {{'exported' declared here}} #endif - internal = 1; -#ifndef IMPLEMENTATION - // expected-error@-2 {{declaration of 'internal' must be imported from module 'A' before it is required}} - // expected-note@p2.cpp:20 {{declaration here is not visible}} -#endif + internal = 1; // expected-error {{use of undeclared identifier 'internal'}} not_exported_private = 1; #ifndef IMPLEMENTATION @@ -78,11 +74,7 @@ void test_late() { // expected-error@-3 {{undeclared identifier}} #endif - internal_private = 1; -#ifndef IMPLEMENTATION - // FIXME: should not be visible here - // expected-error@-3 {{undeclared identifier}} -#endif + internal_private = 1; // expected-error {{use of undeclared identifier 'internal_private'}} } #endif diff --git a/clang/test/CXX/module/basic/basic.def.odr/p4.cppm b/clang/test/CXX/module/basic/basic.def.odr/p4.cppm index 487dbdef283ee..7e88cbe78b4e3 100644 --- a/clang/test/CXX/module/basic/basic.def.odr/p4.cppm +++ b/clang/test/CXX/module/basic/basic.def.odr/p4.cppm @@ -128,7 +128,6 @@ void f(a::b, a::c) {} // // CHECK-DAG: @_ZW6Module25extern_var_module_linkage = external {{(dso_local )?}}global // CHECK-DAG: @_ZW6Module25inline_var_module_linkage = linkonce_odr {{(dso_local )?}}global -// CHECK-DAG: @_ZL25static_var_module_linkage = internal {{(dso_local )?}}global i32 0, // CHECK-DAG: @_ZW6Module24const_var_module_linkage = available_externally {{(dso_local )?}}constant i32 3, module Module; @@ -152,10 +151,6 @@ void use() { (void)&extern_var_module_linkage; (void)&inline_var_module_linkage; - // FIXME: Issue #61427 Internal-linkage declarations in the interface TU - // should not be not visible here. - (void)&static_var_module_linkage; // FIXME: Should not be visible here. 
- (void)&const_var_module_linkage; // FIXME: will be visible after P2788R0 } diff --git a/clang/test/CXX/module/basic/basic.link/p2.cppm b/clang/test/CXX/module/basic/basic.link/p2.cppm index 5a497304201dc..d7d2b5992a235 100644 --- a/clang/test/CXX/module/basic/basic.link/p2.cppm +++ b/clang/test/CXX/module/basic/basic.link/p2.cppm @@ -45,16 +45,14 @@ module M; void use_from_module_impl() { external_linkage_fn(); module_linkage_fn(); - internal_linkage_fn(); // expected-error {{no matching function for call to 'internal_linkage_fn'}} + internal_linkage_fn(); // expected-error {{use of undeclared identifier 'internal_linkage_fn'}} // expected-note@* {{}} (void)external_linkage_class{}; (void)module_linkage_class{}; (void)external_linkage_var; (void)module_linkage_var; - // FIXME: Issue #61427 Internal-linkage declarations in the interface TU - // should not be not visible here. - (void)internal_linkage_class{}; - (void)internal_linkage_var; + (void)internal_linkage_class{}; // expected-error {{use of undeclared identifier 'internal_linkage_class'}} //expected-error{{}} + (void)internal_linkage_var; // expected-error {{use of undeclared identifier 'internal_linkage_var'}} } //--- user.cpp @@ -63,11 +61,10 @@ import M; void use_from_module_impl() { external_linkage_fn(); module_linkage_fn(); // expected-error {{use of undeclared identifier 'module_linkage_fn'}} - internal_linkage_fn(); // expected-error {{declaration of 'internal_linkage_fn' must be imported}} + internal_linkage_fn(); // expected-error {{use of undeclared identifier 'internal_linkage_fn'}} (void)external_linkage_class{}; - (void)module_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} + (void)module_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} // expected-note@* {{}} (void)internal_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} - // expected-note@M.cppm:10 {{declaration here is not visible}} (void)external_linkage_var; (void)module_linkage_var; // expected-error {{undeclared identifier}} (void)internal_linkage_var; // expected-error {{undeclared identifier}} diff --git a/clang/test/Modules/gmodules-nodebug.cpp b/clang/test/Modules/gmodules-nodebug.cpp new file mode 100644 index 0000000000000..d83103768e838 --- /dev/null +++ b/clang/test/Modules/gmodules-nodebug.cpp @@ -0,0 +1,14 @@ +// REQUIRES: asserts + +// RUN: %clang_cc1 -std=c++23 -x c++-header -emit-pch -fmodule-format=obj \ +// RUN: -o %t.pch %s \ +// RUN: -mllvm -debug-only=pchcontainer &>%t-pch.ll +// RUN: cat %t-pch.ll | FileCheck %s + +template +using __void_t [[gnu::nodebug]] = void; + +__void_t<> func() {} + +// CHECK: !DICompileUnit +// CHECK-NOT: __void_t diff --git a/clang/test/Modules/preferred_name.cppm b/clang/test/Modules/preferred_name.cppm index 806781a81c5ca..86ba6ae96db99 100644 --- a/clang/test/Modules/preferred_name.cppm +++ b/clang/test/Modules/preferred_name.cppm @@ -53,10 +53,16 @@ import A; export using ::foo_templ; //--- Use1.cpp -import A; // expected-warning@foo.h:8 {{attribute declaration must precede definition}} -#include "foo.h" // expected-note@foo.h:9 {{previous definition is here}} - +// expected-no-diagnostics +import A; +#include "foo.h" //--- Use2.cpp // expected-no-diagnostics #include "foo.h" import A; + +//--- Use3.cpp +#include "foo.h" +import A; +foo test; +int size = test.size(); // expected-error {{no member named 'size' in 'foo'}} diff --git a/clang/tools/diagtool/DiagnosticNames.cpp 
b/clang/tools/diagtool/DiagnosticNames.cpp index eb90f082437b3..c3a3002889c73 100644 --- a/clang/tools/diagtool/DiagnosticNames.cpp +++ b/clang/tools/diagtool/DiagnosticNames.cpp @@ -23,26 +23,13 @@ llvm::ArrayRef diagtool::getBuiltinDiagnosticsByName() { return llvm::ArrayRef(BuiltinDiagnosticsByName); } - // FIXME: Is it worth having two tables, especially when this one can get // out of sync easily? static const DiagnosticRecord BuiltinDiagnosticsByID[] = { #define DIAG(ENUM, CLASS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFER, CATEGORY) \ {#ENUM, diag::ENUM, STR_SIZE(#ENUM, uint8_t)}, -#include "clang/Basic/DiagnosticCommonKinds.inc" -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#include "clang/Basic/DiagnosticDriverKinds.inc" -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#include "clang/Basic/DiagnosticSerializationKinds.inc" -#include "clang/Basic/DiagnosticLexKinds.inc" -#include "clang/Basic/DiagnosticParseKinds.inc" -#include "clang/Basic/DiagnosticASTKinds.inc" -#include "clang/Basic/DiagnosticCommentKinds.inc" -#include "clang/Basic/DiagnosticSemaKinds.inc" -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" +#include "clang/Basic/AllDiagnosticKinds.inc" #undef DIAG }; @@ -54,6 +41,13 @@ static bool orderByID(const DiagnosticRecord &Left, const DiagnosticRecord &diagtool::getDiagnosticForID(short DiagID) { DiagnosticRecord Key = {nullptr, DiagID, 0}; +  // For lower_bound to produce a valid result it is enough that +  // BuiltinDiagnosticsByID is partitioned (by DiagID), but since we want this +  // function to work for every possible DiagID value passed in, it is better +  // to check up front that BuiltinDiagnosticsByID is actually sorted. +  assert(llvm::is_sorted(BuiltinDiagnosticsByID, orderByID) && +         "IDs in BuiltinDiagnosticsByID must be sorted."); const DiagnosticRecord *Result = llvm::lower_bound(BuiltinDiagnosticsByID, Key, orderByID); assert(Result && "diagnostic not found; table may be out of date"); diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index cc6a8eaebd44e..41730eba32ce2 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -3043,6 +3043,10 @@ static void emitAttributes(const RecordKeeper &Records, raw_ostream &OS, << (R.getValueAsBit("InheritEvenIfAlreadyPresent") ? "true" : "false"); } +  if (R.getValueAsBit("DeferDeserialization")) { +    OS << ", " +       << "/*DeferDeserialization=*/true"; +  } OS << ")\n"; for (auto const &ai : Args) { diff --git a/flang/test/Lower/module_use.f90 b/flang/test/Lower/module_use.f90 index ad43865470b68..b976663239ef5 100644 --- a/flang/test/Lower/module_use.f90 +++ b/flang/test/Lower/module_use.f90 @@ -1,5 +1,6 @@ -! RUN: bbc -emit-fir %S/module_definition.f90 -! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: rm -fr %t && mkdir -p %t +! RUN: bbc -emit-fir -module %t %S/module_definition.f90 +! RUN: bbc -emit-fir -J %t %s -o - | FileCheck %s ! Test use of module data not defined in this file. !
The modules are defined in module_definition.f90 diff --git a/libclc/clc/include/clc/common/clc_degrees.h b/libclc/clc/include/clc/common/clc_degrees.h new file mode 100644 index 0000000000000..e8bb684fcd4d7 --- /dev/null +++ b/libclc/clc/include/clc/common/clc_degrees.h @@ -0,0 +1,12 @@ +#ifndef __CLC_MATH_CLC_DEGREES_H__ +#define __CLC_MATH_CLC_DEGREES_H__ + +#define __CLC_BODY +#define __CLC_FUNCTION __clc_degrees + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_DEGREES_H__ diff --git a/libclc/clc/include/clc/common/clc_radians.h b/libclc/clc/include/clc/common/clc_radians.h new file mode 100644 index 0000000000000..80d481e8de723 --- /dev/null +++ b/libclc/clc/include/clc/common/clc_radians.h @@ -0,0 +1,12 @@ +#ifndef __CLC_MATH_CLC_RADIANS_H__ +#define __CLC_MATH_CLC_RADIANS_H__ + +#define __CLC_BODY +#define __CLC_FUNCTION __clc_radians + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_RADIANS_H__ diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index f3097de694422..d74bff20ba87b 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -1,3 +1,5 @@ +common/clc_degrees.cl +common/clc_radians.cl common/clc_smoothstep.cl geometric/clc_dot.cl integer/clc_abs.cl diff --git a/libclc/clc/lib/generic/common/clc_degrees.cl b/libclc/clc/lib/generic/common/clc_degrees.cl new file mode 100644 index 0000000000000..ce705982072e8 --- /dev/null +++ b/libclc/clc/lib/generic/common/clc_degrees.cl @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include +#include + +#define DEGREES_SINGLE_DEF(TYPE, LITERAL) \ + _CLC_OVERLOAD _CLC_DEF TYPE __clc_degrees(TYPE radians) { \ + return (TYPE)LITERAL * radians; \ + } + +#define DEGREES_DEF(TYPE, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##2, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##3, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##4, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##8, LITERAL) \ + DEGREES_SINGLE_DEF(TYPE##16, LITERAL) + +// 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F +DEGREES_DEF(float, 0x1.ca5dc2p+5F) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F +DEGREES_DEF(double, 0x1.ca5dc1a63c1f8p+5) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +// 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F +DEGREES_DEF(half, (half)0x1.ca5dc1a63c1f8p+5) + +#endif diff --git a/libclc/clc/lib/generic/common/clc_radians.cl b/libclc/clc/lib/generic/common/clc_radians.cl new file mode 100644 index 0000000000000..850b8eb84f9da --- /dev/null +++ b/libclc/clc/lib/generic/common/clc_radians.cl @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2014,2015 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include +#include + +#define RADIANS_SINGLE_DEF(TYPE, LITERAL) \ + _CLC_OVERLOAD _CLC_DEF TYPE __clc_radians(TYPE radians) { \ + return (TYPE)LITERAL * radians; \ + } + +#define RADIANS_DEF(TYPE, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##2, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##3, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##4, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##8, LITERAL) \ + RADIANS_SINGLE_DEF(TYPE##16, LITERAL) + +// pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F +RADIANS_DEF(float, 0x1.1df46ap-6F) + +#ifdef cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F +RADIANS_DEF(double, 0x1.1df46a2529d39p-6) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +// pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F +RADIANS_DEF(half, (half)0x1.1df46a2529d39p-6) + +#endif diff --git a/libclc/clc/lib/spirv/SOURCES b/libclc/clc/lib/spirv/SOURCES index 02784b8def682..ac855ea5184ed 100644 --- a/libclc/clc/lib/spirv/SOURCES +++ b/libclc/clc/lib/spirv/SOURCES @@ -1,3 +1,5 @@ +../generic/common/clc_degrees.cl +../generic/common/clc_radians.cl ../generic/common/clc_smoothstep.cl ../generic/geometric/clc_dot.cl ../generic/math/clc_ceil.cl diff --git a/libclc/clc/lib/spirv64/SOURCES b/libclc/clc/lib/spirv64/SOURCES index 02784b8def682..ac855ea5184ed 100644 --- a/libclc/clc/lib/spirv64/SOURCES +++ b/libclc/clc/lib/spirv64/SOURCES @@ -1,3 +1,5 @@ +../generic/common/clc_degrees.cl +../generic/common/clc_radians.cl ../generic/common/clc_smoothstep.cl ../generic/geometric/clc_dot.cl ../generic/math/clc_ceil.cl diff --git a/libclc/generic/lib/common/degrees.cl b/libclc/generic/lib/common/degrees.cl index cf49b190c76b3..a9715d64f507a 100644 --- a/libclc/generic/lib/common/degrees.cl +++ b/libclc/generic/lib/common/degrees.cl @@ -22,23 +22,20 @@ #include #include +#include -_CLC_OVERLOAD _CLC_DEF float degrees(float radians) { - // 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F - return 0x1.ca5dc2p+5F * radians; -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, degrees, float); - +_CLC_DEFINE_UNARY_BUILTIN(float, degrees, __clc_degrees, float) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_OVERLOAD _CLC_DEF double degrees(double radians) { - // 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F - return 0x1.ca5dc1a63c1f8p+5 * radians; -} +_CLC_DEFINE_UNARY_BUILTIN(double, degrees, __clc_degrees, double) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, degrees, double); +_CLC_DEFINE_UNARY_BUILTIN(half, degrees, __clc_degrees, half) #endif diff --git a/libclc/generic/lib/common/radians.cl b/libclc/generic/lib/common/radians.cl index 645a30549afed..b5dcbfe6e3fd2 100644 --- a/libclc/generic/lib/common/radians.cl +++ b/libclc/generic/lib/common/radians.cl @@ -22,23 +22,20 @@ #include #include +#include -_CLC_OVERLOAD _CLC_DEF float radians(float degrees) { - // pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F - return 0x1.1df46ap-6F * degrees; -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, radians, float); - +_CLC_DEFINE_UNARY_BUILTIN(float, radians, __clc_radians, float) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_OVERLOAD _CLC_DEF double radians(double degrees) { - // pi/180 = 
~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F - return 0x1.1df46a2529d39p-6 * degrees; -} +_CLC_DEFINE_UNARY_BUILTIN(double, radians, __clc_radians, double) + +#endif + +#ifdef cl_khr_fp16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, radians, double); +_CLC_DEFINE_UNARY_BUILTIN(half, radians, __clc_radians, half) #endif diff --git a/lld/COFF/COFFLinkerContext.h b/lld/COFF/COFFLinkerContext.h index bdd625b8c3916..8322f829d4055 100644 --- a/lld/COFF/COFFLinkerContext.h +++ b/lld/COFF/COFFLinkerContext.h @@ -56,7 +56,6 @@ class COFFLinkerContext : public CommonLinkerContext { std::vector objFileInstances; std::map pdbInputFileInstances; std::vector importFileInstances; - std::vector bitcodeFileInstances; MergeChunk *mergeChunkInstances[Log2MaxSectionAlignment + 1] = {}; diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 8b1a8dc3e5af7..898c6c17d2062 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -218,7 +218,7 @@ void LinkerDriver::addFile(InputFile *file) { << " linked in after " "doing LTO compilation."; } - ctx.bitcodeFileInstances.push_back(f); + f->symtab.bitcodeFileInstances.push_back(f); } else if (auto *f = dyn_cast(file)) { ctx.importFileInstances.push_back(f); } @@ -285,7 +285,7 @@ void LinkerDriver::addBuffer(std::unique_ptr mb, addFile(make(ctx, mbref)); break; case file_magic::bitcode: - addFile(make(ctx, mbref, "", 0, lazy)); + addFile(BitcodeFile::create(ctx, mbref, "", 0, lazy)); break; case file_magic::coff_object: case file_magic::coff_import_library: @@ -374,8 +374,8 @@ void LinkerDriver::addArchiveBuffer(MemoryBufferRef mb, StringRef symName, if (magic == file_magic::coff_object) { obj = ObjFile::create(ctx, mb); } else if (magic == file_magic::bitcode) { - obj = - make(ctx, mb, parentName, offsetInArchive, /*lazy=*/false); + obj = BitcodeFile::create(ctx, mb, parentName, offsetInArchive, + /*lazy=*/false); } else if (magic == file_magic::coff_cl_gl_object) { Err(ctx) << mb.getBufferIdentifier() << ": is not a native COFF file. Recompile without /GL?"; @@ -2571,19 +2571,19 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { } } - // If any inputs are bitcode files, the LTO code generator may create - // references to library functions that are not explicit in the bitcode - // file's symbol table. If any of those library functions are defined in a - // bitcode file in an archive member, we need to arrange to use LTO to - // compile those archive members by adding them to the link beforehand. - if (!ctx.bitcodeFileInstances.empty()) { - llvm::Triple TT( - ctx.bitcodeFileInstances.front()->obj->getTargetTriple()); - for (auto *s : lto::LTO::getRuntimeLibcallSymbols(TT)) - ctx.symtab.addLibcall(s); - } - ctx.forEachSymtab([&](SymbolTable &symtab) { + // If any inputs are bitcode files, the LTO code generator may create + // references to library functions that are not explicit in the bitcode + // file's symbol table. If any of those library functions are defined in + // a bitcode file in an archive member, we need to arrange to use LTO to + // compile those archive members by adding them to the link beforehand. + if (!symtab.bitcodeFileInstances.empty()) { + llvm::Triple TT( + symtab.bitcodeFileInstances.front()->obj->getTargetTriple()); + for (auto *s : lto::LTO::getRuntimeLibcallSymbols(TT)) + symtab.addLibcall(s); + } + // Windows specific -- if __load_config_used can be resolved, resolve // it. 
if (symtab.findUnderscore("_load_config_used")) @@ -2639,8 +2639,11 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // If we are going to do codegen for link-time optimization, check for // unresolvable symbols first, so we don't spend time generating code that // will fail to link anyway. - if (!ctx.bitcodeFileInstances.empty() && !config->forceUnresolved) - ctx.symtab.reportUnresolvable(); + if (!config->forceUnresolved) + ctx.forEachSymtab([](SymbolTable &symtab) { + if (!symtab.bitcodeFileInstances.empty()) + symtab.reportUnresolvable(); + }); if (errorCount()) return; @@ -2655,7 +2658,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // link those files (unless -thinlto-index-only was given, in which case we // resolve symbols and write indices, but don't generate native code or link). ltoCompilationDone = true; - ctx.symtab.compileBitcodeFiles(); + ctx.forEachSymtab([](SymbolTable &symtab) { symtab.compileBitcodeFiles(); }); if (Defined *d = dyn_cast_or_null(ctx.symtab.findUnderscore("_tls_used"))) diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 66641ff9dcc1f..5ee73d4dc4f8b 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -1229,10 +1229,15 @@ void ImportFile::parse() { } } -BitcodeFile::BitcodeFile(COFFLinkerContext &ctx, MemoryBufferRef mb, - StringRef archiveName, uint64_t offsetInArchive, - bool lazy) - : InputFile(ctx.symtab, BitcodeKind, mb, lazy) { +BitcodeFile::BitcodeFile(SymbolTable &symtab, MemoryBufferRef mb, + std::unique_ptr &o, bool lazy) + : InputFile(symtab, BitcodeKind, mb, lazy) { + obj.swap(o); +} + +BitcodeFile *BitcodeFile::create(COFFLinkerContext &ctx, MemoryBufferRef mb, + StringRef archiveName, + uint64_t offsetInArchive, bool lazy) { std::string path = mb.getBufferIdentifier().str(); if (ctx.config.thinLTOIndexOnly) path = replaceThinLTOSuffix(mb.getBufferIdentifier(), @@ -1252,7 +1257,9 @@ BitcodeFile::BitcodeFile(COFFLinkerContext &ctx, MemoryBufferRef mb, sys::path::filename(path) + utostr(offsetInArchive))); - obj = check(lto::InputFile::create(mbref)); + std::unique_ptr obj = check(lto::InputFile::create(mbref)); + return make(ctx.getSymtab(getMachineType(obj.get())), mb, obj, + lazy); } BitcodeFile::~BitcodeFile() = default; @@ -1329,7 +1336,7 @@ void BitcodeFile::parseLazy() { } } -MachineTypes BitcodeFile::getMachineType() const { +MachineTypes BitcodeFile::getMachineType(const llvm::lto::InputFile *obj) { Triple t(obj->getTargetTriple()); switch (t.getArch()) { case Triple::x86_64: diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h index d3075c5e0a338..823561cda247a 100644 --- a/lld/COFF/InputFiles.h +++ b/lld/COFF/InputFiles.h @@ -386,13 +386,19 @@ class ImportFile : public InputFile { // Used for LTO. 
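The BitcodeFile changes above replace a parsing constructor with a static create() factory: the owning per-machine SymbolTable can only be picked after lto::InputFile::create has been run on the buffer, so parsing is hoisted out of the constructor and its result is handed in. A rough, self-contained sketch of that parse-then-construct shape; Input, ParsedInput and MachineSymtab below are hypothetical stand-ins, not the lld classes:

    #include <iostream>
    #include <memory>
    #include <string>

    struct ParsedInput { std::string triple; };
    struct MachineSymtab { std::string name; };

    MachineSymtab X64{"x86_64"};
    MachineSymtab A64{"aarch64"};

    class Input {
      MachineSymtab &symtab;
      std::unique_ptr<ParsedInput> parsed;
      Input(MachineSymtab &s, std::unique_ptr<ParsedInput> p)
          : symtab(s), parsed(std::move(p)) {}

    public:
      // Parse first, choose the owner from the parse result, then construct.
      static std::unique_ptr<Input> create(const std::string &buffer) {
        auto parsed = std::make_unique<ParsedInput>();
        parsed->triple =
            buffer.find("aarch64") != std::string::npos ? "aarch64" : "x86_64";
        MachineSymtab &owner = parsed->triple == "aarch64" ? A64 : X64;
        return std::unique_ptr<Input>(new Input(owner, std::move(parsed)));
      }
      const std::string &owner() const { return symtab.name; }
    };

    int main() {
      std::cout << Input::create("target triple = aarch64-...")->owner() << "\n";
    }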
class BitcodeFile : public InputFile { public: - explicit BitcodeFile(COFFLinkerContext &ctx, MemoryBufferRef mb, - StringRef archiveName, uint64_t offsetInArchive, - bool lazy); + explicit BitcodeFile(SymbolTable &symtab, MemoryBufferRef mb, + std::unique_ptr &obj, bool lazy); ~BitcodeFile(); + + static BitcodeFile *create(COFFLinkerContext &ctx, MemoryBufferRef mb, + StringRef archiveName, uint64_t offsetInArchive, + bool lazy); static bool classof(const InputFile *f) { return f->kind() == BitcodeKind; } ArrayRef getSymbols() { return symbols; } - MachineTypes getMachineType() const override; + MachineTypes getMachineType() const override { + return getMachineType(obj.get()); + } + static MachineTypes getMachineType(const llvm::lto::InputFile *obj); void parseLazy(); std::unique_ptr obj; diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index 36dcd0dfe1389..bf965e8a2332d 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -347,8 +347,8 @@ bool SymbolTable::handleMinGWAutomaticImport(Symbol *sym, StringRef name) { /// defined symbol imported" diagnostic for symbols in localImports. /// objFiles and bitcodeFiles (if not nullptr) are used to report where /// undefined symbols are referenced. -static void reportProblemSymbols( - COFFLinkerContext &ctx, const SmallPtrSetImpl &undefs, +void SymbolTable::reportProblemSymbols( + const SmallPtrSetImpl &undefs, const DenseMap *localImports, bool needBitcodeFiles) { // Return early if there is nothing to report (which should be // the common case). @@ -392,7 +392,7 @@ static void reportProblemSymbols( processFile(file, file->getSymbols()); if (needBitcodeFiles) - for (BitcodeFile *file : ctx.bitcodeFileInstances) + for (BitcodeFile *file : bitcodeFileInstances) processFile(file, file->getSymbols()); for (const UndefinedDiag &undefDiag : undefDiags) @@ -423,8 +423,7 @@ void SymbolTable::reportUnresolvable() { undefs.insert(sym); } - reportProblemSymbols(ctx, undefs, - /* localImports */ nullptr, true); + reportProblemSymbols(undefs, /*localImports=*/nullptr, true); } bool SymbolTable::resolveRemainingUndefines() { @@ -506,8 +505,8 @@ bool SymbolTable::resolveRemainingUndefines() { } reportProblemSymbols( - ctx, undefs, - ctx.config.warnLocallyDefinedImported ? &localImports : nullptr, false); + undefs, ctx.config.warnLocallyDefinedImported ? 
&localImports : nullptr, + false); return foundLazy; } @@ -1124,13 +1123,13 @@ Symbol *SymbolTable::addUndefined(StringRef name) { } void SymbolTable::compileBitcodeFiles() { - if (ctx.bitcodeFileInstances.empty()) + if (bitcodeFileInstances.empty()) return; llvm::TimeTraceScope timeScope("Compile bitcode"); ScopedTimer t(ctx.ltoTimer); lto.reset(new BitcodeCompiler(ctx)); - for (BitcodeFile *f : ctx.bitcodeFileInstances) + for (BitcodeFile *f : bitcodeFileInstances) lto->add(*f); for (InputFile *newObj : lto->compile()) { ObjFile *obj = cast(newObj); diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h index 9e316fcdbe630..66bca0d63e5ff 100644 --- a/lld/COFF/SymbolTable.h +++ b/lld/COFF/SymbolTable.h @@ -14,6 +14,7 @@ #include "llvm/ADT/CachedHashString.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/raw_ostream.h" namespace llvm { @@ -155,6 +156,8 @@ class SymbolTable { callback(pair.second); } + std::vector bitcodeFileInstances; + DefinedRegular *loadConfigSym = nullptr; uint32_t loadConfigSize = 0; void initializeLoadConfig(); @@ -175,6 +178,11 @@ class SymbolTable { std::unique_ptr lto; std::vector> entryThunks; llvm::DenseMap exitThunks; + + void + reportProblemSymbols(const llvm::SmallPtrSetImpl &undefs, + const llvm::DenseMap *localImports, + bool needBitcodeFiles); }; std::vector getSymbolLocations(ObjFile *file, uint32_t symIndex); diff --git a/lld/test/COFF/lto-arm64x.ll b/lld/test/COFF/lto-arm64x.ll new file mode 100644 index 0000000000000..bbfc6b64c6fce --- /dev/null +++ b/lld/test/COFF/lto-arm64x.ll @@ -0,0 +1,47 @@ +; REQUIRES: aarch64, x86 +; RUN: split-file %s %t.dir && cd %t.dir + +; RUN: llvm-as arm64ec.ll -o arm64ec.obj +; RUN: llvm-as aarch64.ll -o aarch64.obj +; RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj +; RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj + +; RUN: lld-link -machine:arm64x aarch64.obj arm64ec.obj loadconfig-arm64.obj loadconfig-arm64ec.obj -out:out.exe -subsystem:console +; RUN: llvm-objdump -d out.exe | FileCheck %s + +; CHECK: 0000000140001000 <.text>: +; CHECK-NEXT: 140001000: 52800020 mov w0, #0x1 // =1 +; CHECK-NEXT: 140001004: d65f03c0 ret +; CHECK-NEXT: ... 
+; CHECK-NEXT: 140002000: 00000009 udf #0x9 +; CHECK-NEXT: 140002004: 52800040 mov w0, #0x2 // =2 +; CHECK-NEXT: 140002008: d65f03c0 ret + +; CHECK: 0000000140003000 <.hexpthk>: +; CHECK-NEXT: 140003000: 48 8b c4 movq %rsp, %rax +; CHECK-NEXT: 140003003: 48 89 58 20 movq %rbx, 0x20(%rax) +; CHECK-NEXT: 140003007: 55 pushq %rbp +; CHECK-NEXT: 140003008: 5d popq %rbp +; CHECK-NEXT: 140003009: e9 f6 ef ff ff jmp 0x140002004 <.text+0x1004> +; CHECK-NEXT: 14000300e: cc int3 +; CHECK-NEXT: 14000300f: cc int3 + +#--- arm64ec.ll + +target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "arm64ec-unknown-windows-msvc" + +define dso_local i32 @mainCRTStartup() { +entry: + ret i32 2 +} + +#--- aarch64.ll + +target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "aarch64-unknown-windows-msvc" + +define dso_local i32 @mainCRTStartup() { +entry: + ret i32 1 +} diff --git a/lldb/include/lldb/Symbol/Function.h b/lldb/include/lldb/Symbol/Function.h index 157c007bdf0e8..d0b27269568b0 100644 --- a/lldb/include/lldb/Symbol/Function.h +++ b/lldb/include/lldb/Symbol/Function.h @@ -454,6 +454,11 @@ class Function : public UserID, public SymbolContextScope { /// and variables). const Address &GetAddress() const { return m_address; } + bool GetRangeContainingLoadAddress(lldb::addr_t load_addr, Target &target, + AddressRange &range) { + return m_block.GetRangeContainingLoadAddress(load_addr, target, range); + } + lldb::LanguageType GetLanguage() const; /// Find the file and line number of the source location of the start of the /// function. This will use the declaration if present and fall back on the diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index 4e61c83889b0b..cc848076dab5f 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -842,7 +842,6 @@ SBError SBThread::StepOverUntil(lldb::SBFrame &sb_frame, // appropriate error message. bool all_in_function = true; - AddressRange fun_range = frame_sc.function->GetAddressRange(); std::vector step_over_until_addrs; const bool abort_other_plans = false; @@ -859,7 +858,9 @@ SBError SBThread::StepOverUntil(lldb::SBFrame &sb_frame, addr_t step_addr = sc.line_entry.range.GetBaseAddress().GetLoadAddress(target); if (step_addr != LLDB_INVALID_ADDRESS) { - if (fun_range.ContainsLoadAddress(step_addr, target)) + AddressRange unused_range; + if (frame_sc.function->GetRangeContainingLoadAddress(step_addr, *target, + unused_range)) step_over_until_addrs.push_back(step_addr); else all_in_function = false; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp index 6d073411de876..c2edc52aa964f 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp @@ -281,22 +281,34 @@ bool DWARFDebugInfoEntry::GetDIENamesAndRanges( return !ranges.empty(); } -// Get all attribute values for a given DIE, including following any -// specification or abstract origin attributes and including those in the -// results. Any duplicate attributes will have the first instance take -// precedence (this can happen for declaration attributes). 
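The StepOverUntil change above stops assuming the current function occupies one contiguous AddressRange and instead asks, via Function::GetRangeContainingLoadAddress, whether any of the function's ranges contains the candidate step address; that is what the discontinuous-function tests added later in this patch rely on. A small sketch of that containment check over several half-open ranges, with simplified types rather than the lldb Function/Target API:

    #include <cstdint>
    #include <utility>
    #include <vector>

    // A function split into several [start, end) chunks, e.g. by
    // -ffunction-sections/-fbasic-block-sections as in the new test.
    struct DiscontinuousFunction {
      std::vector<std::pair<uint64_t, uint64_t>> ranges; // half-open load ranges

      bool containsLoadAddress(uint64_t addr) const {
        for (const auto &[start, end] : ranges)
          if (addr >= start && addr < end)
            return true;
        return false;
      }
    };

    int main() {
      DiscontinuousFunction f{{{0x1000, 0x1020}, {0x2000, 0x2010}}};
      // An address in the second chunk is still inside the function; a check
      // against a single contiguous range starting at 0x1000 would miss it.
      return f.containsLoadAddress(0x2008) ? 0 : 1;
    }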
-void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu, -                                         DWARFAttributes &attributes, -                                         Recurse recurse, -                                         uint32_t curr_depth) const { -  const auto *abbrevDecl = GetAbbreviationDeclarationPtr(cu); -  if (!abbrevDecl) { -    attributes.Clear(); -    return; -  } +/// Helper for the public \ref DWARFDebugInfoEntry::GetAttributes API. +/// Adds all attributes of the DIE at the top of the \c worklist to the +/// \c attributes list. Specifications and abstract origins are added +/// to the \c worklist if the referenced DIE has not been seen before. +static bool GetAttributes(llvm::SmallVector &worklist, +                          llvm::SmallSet &seen, +                          DWARFAttributes &attributes) { +  assert(!worklist.empty() && "Need at least one DIE to visit."); +  assert(seen.size() >= 1 && +         "Need to have seen at least the currently visited entry."); + +  DWARFDIE current = worklist.pop_back_val(); + +  const auto *cu = current.GetCU(); +  assert(cu); + +  const auto *entry = current.GetDIE(); +  assert(entry); + +  const auto *abbrevDecl = +      entry->GetAbbreviationDeclarationPtr(current.GetCU()); +  if (!abbrevDecl) +    return false;    const DWARFDataExtractor &data = cu->GetData(); -  lldb::offset_t offset = GetFirstAttributeOffset(); +  lldb::offset_t offset = current.GetDIE()->GetFirstAttributeOffset(); + +  const bool is_first_die = seen.size() == 1;    for (const auto &attribute : abbrevDecl->attributes()) {     DWARFFormValue form_value(cu); @@ -309,10 +321,10 @@ void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu,     switch (attr) {     case DW_AT_sibling:     case DW_AT_declaration: -      if (curr_depth > 0) { +      if (!is_first_die) {         // This attribute doesn't make sense when combined with the DIE that         // references this DIE. We know a DIE is referencing this DIE because -        // curr_depth is not zero +        // we've visited more than one DIE already.         break;       }       [[fallthrough]]; @@ -321,13 +333,12 @@ void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu,       break;     }  -    if (recurse == Recurse::yes && -        ((attr == DW_AT_specification) || (attr == DW_AT_abstract_origin))) { +    if (attr == DW_AT_specification || attr == DW_AT_abstract_origin) {       if (form_value.ExtractValue(data, &offset)) { -        DWARFDIE spec_die = form_value.Reference(); -        if (spec_die) -          spec_die.GetDIE()->GetAttributes(spec_die.GetCU(), attributes, -                                           recurse, curr_depth + 1); +        if (DWARFDIE spec_die = form_value.Reference()) { +          if (seen.insert(spec_die.GetDIE()).second) +            worklist.push_back(spec_die); +        }       }     } else {       const dw_form_t form = form_value.Form(); @@ -339,6 +350,34 @@ void DWARFDebugInfoEntry::GetAttributes(DWARFUnit *cu,         DWARFFormValue::SkipValue(form, data, &offset, cu);       }     } +  } + +  return true; +} + +DWARFAttributes DWARFDebugInfoEntry::GetAttributes(const DWARFUnit *cu, +                                                   Recurse recurse) const { +  // FIXME: use ElaboratingDIEIterator to follow specifications/abstract origins +  // instead of maintaining our own worklist/seen list. + +  DWARFAttributes attributes; + +  llvm::SmallVector worklist; +  worklist.emplace_back(cu, this); + +  // Keep track of DIEs already seen to prevent infinite recursion. +  // The value of '3' was picked for the same reason as in +  // DWARFDie::findRecursively.
+ llvm::SmallSet seen; + seen.insert(this); + + do { + if (!::GetAttributes(worklist, seen, attributes)) { + attributes.Clear(); + break; + } + } while (!worklist.empty() && recurse == Recurse::yes); + + return attributes; } // GetAttributeValue diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h index de6bbf1d52789..72aeb2743b1e2 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h @@ -52,12 +52,28 @@ class DWARFDebugInfoEntry { lldb::offset_t *offset_ptr); using Recurse = DWARFBaseDIE::Recurse; - DWARFAttributes GetAttributes(DWARFUnit *cu, - Recurse recurse = Recurse::yes) const { - DWARFAttributes attrs; - GetAttributes(cu, attrs, recurse, 0 /* curr_depth */); - return attrs; - } + + /// Get all attribute values for a given DIE, optionally following any + /// specifications and abstract origins and including their attributes + /// in the result too. + /// + /// When following specifications/abstract origins, the attributes + /// on the referring DIE are guaranteed to be visited before the attributes of + /// the referenced DIE. + /// + /// \param[in] cu DWARFUnit that this entry belongs to. + /// + /// \param[in] recurse If set to \c Recurse::yes, will include attributes + /// on DIEs referenced via \c DW_AT_specification and \c DW_AT_abstract_origin + /// (including across multiple levels of indirection). + /// + /// \returns DWARFAttributes that include all attributes found on this DIE + /// (and possibly referenced DIEs). Attributes may appear multiple times + /// (e.g., if a declaration and definition both specify the same attribute). + /// On failure, the returned DWARFAttributes will be empty. + /// + DWARFAttributes GetAttributes(const DWARFUnit *cu, + Recurse recurse = Recurse::yes) const; dw_offset_t GetAttributeValue(const DWARFUnit *cu, const dw_attr_t attr, DWARFFormValue &formValue, @@ -178,10 +194,6 @@ class DWARFDebugInfoEntry { /// A copy of the DW_TAG value so we don't have to go through the compile /// unit abbrev table dw_tag_t m_tag = llvm::dwarf::DW_TAG_null; - -private: - void GetAttributes(DWARFUnit *cu, DWARFAttributes &attrs, Recurse recurse, - uint32_t curr_depth) const; }; } // namespace dwarf } // namespace lldb_private::plugin diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 2f451d173c4dd..ad5005b660c64 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -3414,7 +3414,10 @@ VariableSP SymbolFileDWARF::ParseVariableDIE(const SymbolContext &sc, mangled = form_value.AsCString(); break; case DW_AT_type: - type_die_form = form_value; + // DW_AT_type on declaration may be less accurate than + // that of definition, so don't overwrite it. 
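The GetAttributes rewrite above trades recursion through DW_AT_specification / DW_AT_abstract_origin for an explicit worklist plus a seen set, so reference cycles terminate and a referring DIE's attributes are always collected before those of the DIE it references. The same traversal shape in a generic, self-contained form (Node and collect are illustrative types, not the lldb ones):

    #include <cstdio>
    #include <unordered_set>
    #include <vector>

    struct Node {
      int value;
      const Node *reference = nullptr; // plays the role of DW_AT_specification
    };

    // Visit a node and everything reachable through `reference`, collecting
    // values. The seen set guarantees termination even if references form a
    // cycle; the worklist preserves referring-before-referenced order.
    std::vector<int> collect(const Node &start) {
      std::vector<int> values;
      std::vector<const Node *> worklist{&start};
      std::unordered_set<const Node *> seen{&start};
      while (!worklist.empty()) {
        const Node *current = worklist.back();
        worklist.pop_back();
        values.push_back(current->value);
        if (current->reference && seen.insert(current->reference).second)
          worklist.push_back(current->reference);
      }
      return values;
    }

    int main() {
      Node a{1}, b{2}, c{3};
      a.reference = &b; b.reference = &c; c.reference = &a; // a cycle, as in the unit tests below
      for (int v : collect(a))
        std::printf("%d ", v); // prints "1 2 3" and stops
    }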
+ if (!type_die_form.IsValid()) + type_die_form = form_value; break; case DW_AT_external: is_external = form_value.Boolean(); diff --git a/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py b/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py new file mode 100644 index 0000000000000..59e028acf014c --- /dev/null +++ b/lldb/test/API/functionalities/thread/step_until/TestStepUntilAPI.py @@ -0,0 +1,136 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class TestStepUntilAPI(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def setUp(self): + super().setUp() + + self.main_source = "main.c" + self.main_spec = lldb.SBFileSpec(self.main_source) + self.less_than_two = line_number("main.c", "Less than 2") + self.greater_than_two = line_number("main.c", "Greater than or equal to 2.") + self.back_out_in_main = line_number("main.c", "Back out in main") + self.in_foo = line_number("main.c", "In foo") + + def _build_dict_for_discontinuity(self): + return dict( + CFLAGS_EXTRAS="-funique-basic-block-section-names " + + "-ffunction-sections -fbasic-block-sections=list=" + + self.getSourcePath("function.list"), + LD_EXTRAS="-Wl,--script=" + self.getSourcePath("symbol.order"), + ) + + def _do_until(self, build_dict, args, until_line, expected_line): + self.build(dictionary=build_dict) + launch_info = lldb.SBLaunchInfo(args) + _, _, thread, _ = lldbutil.run_to_source_breakpoint( + self, "At the start", self.main_spec, launch_info + ) + + self.assertSuccess( + thread.StepOverUntil(self.frame(), self.main_spec, until_line) + ) + + self.runCmd("process status") + + line = self.frame().GetLineEntry().GetLine() + self.assertEqual( + line, expected_line, "Did not get the expected stop line number" + ) + + def _assertDiscontinuity(self): + target = self.target() + foo = target.FindFunctions("foo") + self.assertEqual(len(foo), 1) + foo = foo[0] + + call_me = self.target().FindFunctions("call_me") + self.assertEqual(len(call_me), 1) + call_me = call_me[0] + + foo_addr = foo.function.GetStartAddress().GetLoadAddress(target) + found_before = False + found_after = False + for range in call_me.function.GetRanges(): + addr = range.GetBaseAddress().GetLoadAddress(target) + if addr < foo_addr: + found_before = True + if addr > foo_addr: + found_after = True + + self.assertTrue( + found_before and found_after, + "'foo' is not between 'call_me'" + str(foo) + str(call_me), + ) + + def test_hitting(self): + """Test SBThread.StepOverUntil - targeting a line and hitting it.""" + self._do_until(None, None, self.less_than_two, self.less_than_two) + + @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + @skipIf(archs=no_match(["x86_64", "aarch64"])) + def test_hitting_discontinuous(self): + """Test SBThread.StepOverUntil - targeting a line and hitting it -- with + discontinuous functions""" + self._do_until( + self._build_dict_for_discontinuity(), + None, + self.less_than_two, + self.less_than_two, + ) + self._assertDiscontinuity() + + def test_missing(self): + """Test SBThread.StepOverUntil - targeting a line and missing it by stepping out to call site""" + self._do_until( + None, ["foo", "bar", "baz"], self.less_than_two, self.back_out_in_main + ) + + @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + @skipIf(archs=no_match(["x86_64", "aarch64"])) + def test_missing_discontinuous(self): + """Test SBThread.StepOverUntil - targeting a line and missing it by + stepping out to call 
site -- with discontinuous functions""" + self._do_until( + self._build_dict_for_discontinuity(), + ["foo", "bar", "baz"], + self.less_than_two, + self.back_out_in_main, + ) + self._assertDiscontinuity() + + def test_bad_line(self): + """Test that we get an error if attempting to step outside the current + function""" + self.build() + _, _, thread, _ = lldbutil.run_to_source_breakpoint( + self, "At the start", self.main_spec + ) + self.assertIn( + "step until target not in current function", + thread.StepOverUntil( + self.frame(), self.main_spec, self.in_foo + ).GetCString(), + ) + + @skipIf(oslist=lldbplatformutil.getDarwinOSTriples() + ["windows"]) + @skipIf(archs=no_match(["x86_64", "aarch64"])) + def test_bad_line_discontinuous(self): + """Test that we get an error if attempting to step outside the current + function -- and the function is discontinuous""" + self.build(dictionary=self._build_dict_for_discontinuity()) + _, _, thread, _ = lldbutil.run_to_source_breakpoint( + self, "At the start", self.main_spec + ) + self.assertIn( + "step until target not in current function", + thread.StepOverUntil( + self.frame(), self.main_spec, self.in_foo + ).GetCString(), + ) + self._assertDiscontinuity() diff --git a/lldb/test/API/functionalities/thread/step_until/function.list b/lldb/test/API/functionalities/thread/step_until/function.list new file mode 100644 index 0000000000000..5900fe8c35069 --- /dev/null +++ b/lldb/test/API/functionalities/thread/step_until/function.list @@ -0,0 +1 @@ +!call_me diff --git a/lldb/test/API/functionalities/thread/step_until/main.c b/lldb/test/API/functionalities/thread/step_until/main.c index bb866079cf5f5..4c52308f030e9 100644 --- a/lldb/test/API/functionalities/thread/step_until/main.c +++ b/lldb/test/API/functionalities/thread/step_until/main.c @@ -4,6 +4,9 @@ * unrelated to the program, just to achieve consistent * debug line tables, across platforms, that are not * dependent on compiler optimzations. */ + +int foo(int x) { return x; /* In foo */ } + int call_me(int argc) { printf ("At the start, argc: %d.\n", argc); diff --git a/lldb/test/API/functionalities/thread/step_until/symbol.order b/lldb/test/API/functionalities/thread/step_until/symbol.order new file mode 100644 index 0000000000000..dcc9607a4188f --- /dev/null +++ b/lldb/test/API/functionalities/thread/step_until/symbol.order @@ -0,0 +1,9 @@ +SECTIONS { + .text.ordered : { + *(.text.call_me) + *(.text.foo) + *(.text.call_me.call_me.__part.1) + *(.text.call_me.call_me.__part.2) + *(.text.call_me.call_me.__part.3) + } +} INSERT BEFORE .text; diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp index 1e4c8f3ba0778..3f61d1607073c 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp @@ -394,3 +394,643 @@ TEST(DWARFDIETest, GetContextInFunction) { EXPECT_THAT(foo_struct_die.GetTypeLookupContext(), testing::ElementsAre(make_struct("struct_t"))); } + +struct GetAttributesTestFixture : public testing::TestWithParam {}; + +TEST_P(GetAttributesTestFixture, TestGetAttributes_IterationOrder) { + // Tests that we accumulate all current DIE's attributes first + // before checking the attributes of the specification. 
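GetAttributesTestFixture above is a value-parameterized GoogleTest fixture: each TEST_P body runs once per value passed to INSTANTIATE_TEST_SUITE_P (here DW_AT_specification and DW_AT_abstract_origin, spliced into the DWARF YAML through llvm::formatv). A minimal standalone sketch of that GoogleTest pattern, assuming GoogleTest is available; the fixture name and values below are illustrative:

    #include <gtest/gtest.h>
    #include <string>

    // GetParam() yields one of the values supplied by INSTANTIATE_TEST_SUITE_P,
    // and every TEST_P in the fixture runs once per value.
    struct AttrKindFixture : public testing::TestWithParam<std::string> {};

    TEST_P(AttrKindFixture, NameIsNonEmpty) { EXPECT_FALSE(GetParam().empty()); }

    INSTANTIATE_TEST_SUITE_P(AttrKinds, AttrKindFixture,
                             testing::Values("DW_AT_specification",
                                             "DW_AT_abstract_origin"));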
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Attribute: DW_AT_low_pc + Form: DW_FORM_data4 + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: {0} + Form: DW_FORM_ref4 + - Attribute: DW_AT_low_pc + Form: DW_FORM_data4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_high_pc [DW_FORM_data4] +# DW_AT_name [DW_FORM_strp] ("func") +# DW_AT_low_pc [DW_FORM_data4] + - AbbrCode: 0x2 + Values: + - Value: 0xdeadbeef + - Value: 0x0 + - Value: 0x1 + - Value: 0x1 + - Value: 0xdeadbeef + +# DW_TAG_subprogram +# DW_AT_high_pc [DW_FORM_data4] +# DW_AT_specification [DW_FORM_ref4] ("func") +# DW_AT_low_pc [DW_FORM_data4] + - AbbrCode: 0x3 + Values: + - Value: 0xf00dcafe + - Value: 0xf + - Value: 0xf00dcafe + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + ASSERT_FALSE(definition.GetAttributeValueAsOptionalUnsigned(DW_AT_external)); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 7U); + + // Check that the attributes on the definition (that are also present + // on the declaration) take precedence. + for (auto attr : {DW_AT_low_pc, DW_AT_high_pc}) { + auto idx = attrs.FindAttributeIndex(attr); + EXPECT_NE(idx, UINT32_MAX); + + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(idx, form_value); + EXPECT_TRUE(success); + + EXPECT_EQ(form_value.Unsigned(), 0xf00dcafe); + } +} + +TEST_P(GetAttributesTestFixture, TestGetAttributes_Cycle) { + // Tests that GetAttributes can deal with cycles in + // specifications/abstract origins. 
+ // + // Contrived example: + // + // func1 -> func3 + // ^ | + // | v + // +------func2 + + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: {0} + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + + - AbbrCode: 0x2 + Values: + - Value: 0x19 + + - AbbrCode: 0x2 + Values: + - Value: 0xf + + - AbbrCode: 0x2 + Values: + - Value: 0x14 + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto func1 = cu_die.GetFirstChild(); + ASSERT_TRUE(func1.IsValid()); + ASSERT_EQ(func1.Tag(), DW_TAG_subprogram); + + auto func2 = func1.GetSibling(); + ASSERT_TRUE(func2.IsValid()); + ASSERT_EQ(func2.Tag(), DW_TAG_subprogram); + + auto func3 = func2.GetSibling(); + ASSERT_TRUE(func3.IsValid()); + ASSERT_EQ(func3.Tag(), DW_TAG_subprogram); + + auto attrs = func1.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 3U); + + // Confirm that the specifications do form a cycle. + { + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(0, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Reference(), func3); + } + + { + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(1, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Reference(), func2); + } + + { + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(2, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Reference(), func1); + } +} + +TEST_P(GetAttributesTestFixture, + TestGetAttributes_SkipNonApplicableAttributes) { + // Tests that GetAttributes will omit attributes found through + // specifications/abstract origins which are not applicable. 
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_sibling + Form: DW_FORM_ref4 + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: {0} + Form: DW_FORM_ref4 + - Attribute: DW_AT_sibling + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_declaration +# DW_AT_name [DW_FORM_strp] ("func") +# DW_AT_sibling + - AbbrCode: 0x2 + Values: + - Value: 0x1 + - Value: 0x0 + - Value: 0x18 + +# DW_TAG_subprogram +# DW_AT_declaration +# DW_AT_specification [DW_FORM_ref4] ("func") +# DW_AT_sibling + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0xf + - Value: 0xdeadbeef + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 4U); + EXPECT_NE(attrs.FindAttributeIndex(DW_AT_name), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(GetParam()), UINT32_MAX); + + auto sibling_idx = attrs.FindAttributeIndex(DW_AT_sibling); + EXPECT_NE(sibling_idx, UINT32_MAX); + + DWARFFormValue form_value; + auto success = attrs.ExtractFormValueAtIndex(sibling_idx, form_value); + ASSERT_TRUE(success); + + EXPECT_EQ(form_value.Unsigned(), 0xdeadbeef); +} + +TEST_P(GetAttributesTestFixture, TestGetAttributes_NoRecurse) { + // Tests that GetAttributes will not recurse if Recurse::No is passed to it. 
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_data4 + - Attribute: {0} + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_name [DW_FORM_strp] ("func") + - AbbrCode: 0x2 + Values: + - Value: 0x0 + +# DW_TAG_subprogram +# DW_AT_low_pc [DW_FORM_data4] +# DW_AT_specification [DW_FORM_ref4] + - AbbrCode: 0x3 + Values: + - Value: 0xdeadbeef + - Value: 0xf + + - AbbrCode: 0x0 +... +)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::no); + EXPECT_EQ(attrs.Size(), 2U); + EXPECT_EQ(attrs.FindAttributeIndex(DW_AT_name), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(GetParam()), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(DW_AT_low_pc), UINT32_MAX); +} + +TEST_P(GetAttributesTestFixture, TestGetAttributes_InvalidSpec) { + // Test that GetAttributes doesn't try following invalid + // specifications (but still add it to the list of attributes). + + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: {0} + Form: DW_FORM_ref4 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_name [DW_FORM_strp] ("func") + - AbbrCode: 0x2 + Values: + - Value: 0x0 + +# DW_TAG_subprogram +# DW_AT_specification [DW_FORM_ref4] + - AbbrCode: 0x3 + Values: + - Value: 0xdeadbeef + + - AbbrCode: 0x0 +... 
+)"; + YAMLModuleTester t(llvm::formatv(yamldata, GetParam()).str()); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto declaration = cu_die.GetFirstChild(); + ASSERT_TRUE(declaration.IsValid()); + ASSERT_EQ(declaration.Tag(), DW_TAG_subprogram); + + auto definition = declaration.GetSibling(); + ASSERT_TRUE(definition.IsValid()); + ASSERT_EQ(definition.Tag(), DW_TAG_subprogram); + + auto attrs = definition.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 1U); + EXPECT_EQ(attrs.FindAttributeIndex(DW_AT_name), UINT32_MAX); + EXPECT_NE(attrs.FindAttributeIndex(GetParam()), UINT32_MAX); +} + +TEST(DWARFDIETest, TestGetAttributes_Worklist) { + // Test that GetAttributes will follow both the abstract origin + // and specification on a single DIE correctly (omitting non-applicable + // attributes in the process). + + // Contrived example where + // f1---> f2 --> f4 + // `-> f3 `-> f5 + // + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - foo + - bar + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_specification + Form: DW_FORM_ref4 + - Attribute: DW_AT_abstract_origin + Form: DW_FORM_ref4 + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_artificial + Form: DW_FORM_flag_present + + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram ("f1") +# DW_AT_specification [DW_FORM_ref4] ("f2") +# DW_AT_abstract_origin [DW_FORM_ref4] ("f3") + - AbbrCode: 0x2 + Values: + - Value: 0x18 + - Value: 0x21 + +# DW_TAG_subprogram ("f2") +# DW_AT_specification [DW_FORM_ref4] ("f4") +# DW_AT_abstract_origin [DW_FORM_ref4] ("f5") + - AbbrCode: 0x2 + Values: + - Value: 0x22 + - Value: 0x23 + +# DW_TAG_subprogram ("f3") +# DW_AT_declaration [DW_FORM_flag_present] +# DW_AT_artificial [DW_FORM_flag_present] + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0x1 + +# DW_TAG_subprogram ("f4") +# DW_AT_declaration [DW_FORM_flag_present] +# DW_AT_artificial [DW_FORM_flag_present] + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0x1 + +# DW_TAG_subprogram ("f5") +# DW_AT_declaration [DW_FORM_flag_present] +# DW_AT_artificial [DW_FORM_flag_present] + - AbbrCode: 0x3 + Values: + - Value: 0x1 + - Value: 0x1 + + - AbbrCode: 0x0 +... 
+)"; + YAMLModuleTester t(yamldata); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto f1 = cu_die.GetFirstChild(); + ASSERT_TRUE(f1.IsValid()); + ASSERT_EQ(f1.Tag(), DW_TAG_subprogram); + + auto attrs = f1.GetAttributes(DWARFDebugInfoEntry::Recurse::yes); + EXPECT_EQ(attrs.Size(), 7U); + EXPECT_EQ(attrs.FindAttributeIndex(DW_AT_declaration), UINT32_MAX); +} + +INSTANTIATE_TEST_SUITE_P(GetAttributeTests, GetAttributesTestFixture, + testing::Values(DW_AT_specification, + DW_AT_abstract_origin)); diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index e2af991ed37b1..10714b508ca68 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -169,10 +169,26 @@ rnk@google.com (email), [rnk](https://github.com/rnk) (GitHub) ### Backends / Targets -#### AArch64 backend +#### ARM and AArch64 backends -Tim Northover \ -t.p.northover@gmail.com (email), [TNorthover](https://github.com/TNorthover) (GitHub) +David Green \ +david.green@arm.com (email), [davemgreen](https://github.com/davemgreen) (GitHub) \ +Amara Emerson (esp. AArch64 GlobalISel) \ +amara@apple.com (email), [aemerson](https://github.com/aemerson) (GitHub) \ +Eli Friedman (esp. ARM64EC) \ +efriedma@quicinc.com (email), [efriedma-quic](https://github.com/efriedma-quic) (GitHub) \ +Sjoerd Meijer \ +smeijer@nvidia.com (email), [sjoerdmeijer](https://github.com/sjoerdmeijer) (GitHub) \ +Nashe Mncube \ +nashe.mncube@arm.com (email), [nasherm](https://github.com/nasherm) (GitHub) \ +Sander de Smalen (esp. scalable vectorization/SVE/SME) \ +sander.desmalen@arm.com (email), [sdesmalen-arm](https://github.com/sdesmalen-arm) (GitHub) \ +Peter Smith (Anything ABI) \ +peter.smith@arm.com (email), [smithp35](https://github.com/smithp35) (GitHub) \ +Oliver Stannard (esp. 
assembly/dissassembly) \ +oliver.stannard@arm.com (email), [ostannard](https://github.com/ostannard) (GitHub) \ +Ties Stuij (Arm GlobalISel and early arch support) \ +ties.stuij@arm.com (email), [stuij](https://github.com/stuij) (GitHub) #### AMDGPU backend @@ -184,19 +200,6 @@ Matthew.Arsenault@amd.com, arsenm2@gmail.com (email), [arsenm](https://github.co Mark Schimmel \ marksl@synopsys.com (email), [markschimmel](https://github.com/markschimmel) (GitHub) -#### ARM backend - -David Green \ -david.green@arm.com (email), [davemgreen](https://github.com/davemgreen) (GitHub) \ -Oliver Stannard (Especially assembly/dissassembly) \ -oliver.stannard@arm.com (email), [ostannard](https://github.com/ostannard) (GitHub) \ -Nashe Mncube \ -nashe.mncube@arm.com (email), [nasherm](https://github.com/nasherm) (GitHub) \ -Peter Smith (Anything ABI) \ -peter.smith@arm.com (email), [smithp35](https://github.com/smithp35) (GitHub) \ -Ties Stuij (GlobalISel and early arch support) \ -ties.stuij@arm.com (email), [stuij](https://github.com/stuij) (GitHub) - #### AVR backend Ben Shi \ @@ -480,6 +483,7 @@ James Grosbach (grosbach@apple.com) -- MC layer \ Anton Korobeynikov (anton@korobeynikov.info, [asl](https://github.com/asl)) -- ARM EABI, Windows codegen \ Benjamin Kramer (benny.kra@gmail.com, [d0k](https://github.com/d0k)) -- DWARF Parser \ David Majnemer (david.majnemer@gmail.com, [majnemer](https://github.com/majnemer)) -- InstCombine, ConstantFold \ +Tim Northover (t.p.northover@gmail.com, [TNorthover](https://github.com/TNorthover)) -- AArch64 backend \ Chad Rosier (mcrosier@codeaurora.org) -- FastISel \ Hans Wennborg (hans@chromium.org, [zmodem](https://github.com/zmodem)) -- Release management \ Kostya Serebryany ([kcc](https://github.com/kcc)) -- Sanitizers \ diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index fe13fc676e303..71b204f9c3fec 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1870,6 +1870,13 @@ class TargetTransformInfo { /// false, but it shouldn't matter what it returns anyway. bool hasArmWideBranch(bool Thumb) const; + /// Returns a bitmask constructed from the target-features or fmv-features + /// metadata of a function. + uint64_t getFeatureMask(const Function &F) const; + + /// Returns true if this is an instance of a function with multiple versions. + bool isMultiversionedFunction(const Function &F) const; + /// \return The maximum number of function arguments the target supports. 
unsigned getMaxNumArgs() const; @@ -2312,6 +2319,8 @@ class TargetTransformInfo::Concept { virtual VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; + virtual uint64_t getFeatureMask(const Function &F) const = 0; + virtual bool isMultiversionedFunction(const Function &F) const = 0; virtual unsigned getMaxNumArgs() const = 0; virtual unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const = 0; @@ -3144,6 +3153,14 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.hasArmWideBranch(Thumb); } + uint64_t getFeatureMask(const Function &F) const override { + return Impl.getFeatureMask(F); + } + + bool isMultiversionedFunction(const Function &F) const override { + return Impl.isMultiversionedFunction(F); + } + unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 7ac3063ca9a37..dcef4a1abcfa3 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1039,6 +1039,10 @@ class TargetTransformInfoImplBase { bool hasArmWideBranch(bool) const { return false; } + uint64_t getFeatureMask(const Function &F) const { return 0; } + + bool isMultiversionedFunction(const Function &F) const { return false; } + unsigned getMaxNumArgs() const { return UINT_MAX; } unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const { diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index 4faa090901a6a..4488a6152117c 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -896,6 +896,10 @@ inline UnaryOpc_match m_ChainedUnaryOp(unsigned Opc, return UnaryOpc_match(Opc, Op); } +template inline UnaryOpc_match m_BitCast(const Opnd &Op) { + return UnaryOpc_match(ISD::BITCAST, Op); +} + template inline UnaryOpc_match m_BSwap(const Opnd &Op) { return UnaryOpc_match(ISD::BSWAP, Op); diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 63f06a3a69298..0338770593bc4 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -270,13 +270,16 @@ void fillValidCPUArchList(SmallVectorImpl &Values); bool isX18ReservedByDefault(const Triple &TT); -// Return the priority for a given set of FMV features. +// For a given set of feature names, which can be either target-features, or +// fmv-features metadata, expand their dependencies and then return a bitmask +// corresponding to the entries of AArch64::FeatPriorities. uint64_t getFMVPriority(ArrayRef Features); -// For given feature names, return a bitmask corresponding to the entries of -// AArch64::CPUFeatures. The values in CPUFeatures are not bitmasks themselves, -// they are sequential (0, 1, 2, 3, ...). The resulting bitmask is used at -// runtime to test whether a certain FMV feature is available on the host. +// For a given set of FMV feature names, expand their dependencies and then +// return a bitmask corresponding to the entries of AArch64::CPUFeatures. +// The values in CPUFeatures are not bitmasks themselves, they are sequential +// (0, 1, 2, 3, ...). The resulting bitmask is used at runtime to test whether +// a certain FMV feature is available on the host. 
uint64_t getCpuSupportsMask(ArrayRef Features); void PrintSupportedExtensions(); diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 7bbd469bd035d..11ccfa33821ca 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -25,6 +25,8 @@ using namespace llvm; +extern cl::opt UseDerefAtPointSemantics; + static bool isAligned(const Value *Base, Align Alignment, const DataLayout &DL) { return Base->getPointerAlignment(DL) >= Alignment; @@ -168,7 +170,7 @@ static bool isDereferenceableAndAlignedPointer( Size, DL, CtxI, AC, DT, TLI, Visited, MaxDepth); - if (CtxI) { + if (CtxI && (!UseDerefAtPointSemantics || !V->canBeFreed())) { /// Look through assumes to see if both dereferencability and alignment can /// be proven by an assume if needed. RetainedKnowledge AlignRK; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index df42dc2746daf..8b9722d047edc 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1383,6 +1383,14 @@ bool TargetTransformInfo::hasArmWideBranch(bool Thumb) const { return TTIImpl->hasArmWideBranch(Thumb); } +uint64_t TargetTransformInfo::getFeatureMask(const Function &F) const { + return TTIImpl->getFeatureMask(F); +} + +bool TargetTransformInfo::isMultiversionedFunction(const Function &F) const { + return TTIImpl->isMultiversionedFunction(F); +} + unsigned TargetTransformInfo::getMaxNumArgs() const { return TTIImpl->getMaxNumArgs(); } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index de7fb21f5903e..49e5b7d9ef014 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -15770,7 +15770,7 @@ SDValue DAGCombiner::foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG, // FIXME: I don't think looking for bitcast intrinsically makes sense, but // removing this would require more changes. auto IsBitCastOrFree = [&TLI, FPOpcode](SDValue Op, EVT VT) { - if (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).getValueType() == VT) + if (sd_match(Op, m_BitCast(m_SpecificVT(VT)))) return true; return FPOpcode == ISD::FABS ? TLI.isFAbsFree(VT) : TLI.isFNegFree(VT); diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 188a450d12fde..7dbf65fbf055b 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -264,6 +264,33 @@ computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, return Result; } +/// Emit an implicit cast to convert \p XRead to type of variable \p V +static llvm::Value *emitImplicitCast(IRBuilder<> &Builder, llvm::Value *XRead, + llvm::Value *V) { + // TODO: Add this functionality to the `AtomicInfo` interface + llvm::Type *XReadType = XRead->getType(); + llvm::Type *VType = V->getType(); + if (llvm::AllocaInst *vAlloca = dyn_cast(V)) + VType = vAlloca->getAllocatedType(); + + if (XReadType->isStructTy() && VType->isStructTy()) + // No need to extract or convert. A direct + // `store` will suffice. 
+ return XRead; + + if (XReadType->isStructTy()) + XRead = Builder.CreateExtractValue(XRead, /*Idxs=*/0); + if (VType->isIntegerTy() && XReadType->isFloatingPointTy()) + XRead = Builder.CreateFPToSI(XRead, VType); + else if (VType->isFloatingPointTy() && XReadType->isIntegerTy()) + XRead = Builder.CreateSIToFP(XRead, VType); + else if (VType->isIntegerTy() && XReadType->isIntegerTy()) + XRead = Builder.CreateIntCast(XRead, VType, true); + else if (VType->isFloatingPointTy() && XReadType->isFloatingPointTy()) + XRead = Builder.CreateFPCast(XRead, VType); + return XRead; +} + /// Make \p Source branch to \p Target. /// /// Handles two situations: @@ -8501,6 +8528,8 @@ OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc, } } checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read); + if (XRead->getType() != V.Var->getType()) + XRead = emitImplicitCast(Builder, XRead, V.Var); Builder.CreateStore(XRead, V.Var, V.IsVolatile); return Builder.saveIP(); } @@ -8785,6 +8814,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture( return AtomicResult.takeError(); Value *CapturedVal = (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second); + if (CapturedVal->getType() != V.Var->getType()) + CapturedVal = emitImplicitCast(Builder, CapturedVal, V.Var); Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile); checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture); diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 65b63955b6f6d..eddb67282fca4 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -36,7 +36,7 @@ using namespace llvm; -static cl::opt UseDerefAtPointSemantics( +cl::opt UseDerefAtPointSemantics( "use-dereferenceable-at-point-semantics", cl::Hidden, cl::init(false), cl::desc("Deref attributes and metadata infer facts at definition only")); diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 206e410047db5..1582d1999ca1d 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2926,26 +2926,12 @@ struct RegPairInfo { int FrameIdx; int Offset; enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type; + const TargetRegisterClass *RC; RegPairInfo() = default; bool isPaired() const { return Reg2 != AArch64::NoRegister; } - unsigned getScale() const { - switch (Type) { - case PPR: - return 2; - case GPR: - case FPR64: - case VG: - return 8; - case ZPR: - case FPR128: - return 16; - } - llvm_unreachable("Unsupported type"); - } - bool isScalable() const { return Type == PPR || Type == ZPR; } }; @@ -3023,20 +3009,27 @@ static void computeCalleeSaveRegisterPairs( RegPairInfo RPI; RPI.Reg1 = CSI[i].getReg(); - if (AArch64::GPR64RegClass.contains(RPI.Reg1)) + if (AArch64::GPR64RegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::GPR; - else if (AArch64::FPR64RegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::GPR64RegClass; + } else if (AArch64::FPR64RegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::FPR64; - else if (AArch64::FPR128RegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::FPR64RegClass; + } else if (AArch64::FPR128RegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::FPR128; - else if (AArch64::ZPRRegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::FPR128RegClass; + } else if (AArch64::ZPRRegClass.contains(RPI.Reg1)) { RPI.Type = RegPairInfo::ZPR; - else if (AArch64::PPRRegClass.contains(RPI.Reg1)) + RPI.RC = &AArch64::ZPRRegClass; + } else if (AArch64::PPRRegClass.contains(RPI.Reg1)) { RPI.Type 
= RegPairInfo::PPR; - else if (RPI.Reg1 == AArch64::VG) + RPI.RC = &AArch64::PPRRegClass; + } else if (RPI.Reg1 == AArch64::VG) { RPI.Type = RegPairInfo::VG; - else + RPI.RC = &AArch64::FIXED_REGSRegClass; + } else { llvm_unreachable("Unsupported register class."); + } // Add the stack hazard size as we transition from GPR->FPR CSRs. if (AFI->hasStackHazardSlotIndex() && @@ -3045,7 +3038,7 @@ static void computeCalleeSaveRegisterPairs( ByteOffset += StackFillDir * StackHazardSize; LastReg = RPI.Reg1; - int Scale = RPI.getScale(); + int Scale = TRI->getSpillSize(*RPI.RC); // Add the next reg to the pair if it is in the same register class. if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) { Register NextReg = CSI[i + RegInc].getReg(); @@ -3254,38 +3247,26 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( // Rationale: This sequence saves uop updates compared to a sequence of // pre-increment spills like stp xi,xj,[sp,#-16]! // Note: Similar rationale and sequence for restores in epilog. - unsigned Size; - Align Alignment; + unsigned Size = TRI->getSpillSize(*RPI.RC); + Align Alignment = TRI->getSpillAlign(*RPI.RC); switch (RPI.Type) { case RegPairInfo::GPR: StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR64: StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR128: StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::ZPR: StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::PPR: StrOpc = AArch64::STR_PXI; - Size = 2; - Alignment = Align(2); break; case RegPairInfo::VG: StrOpc = AArch64::STRXui; - Size = 8; - Alignment = Align(8); break; } @@ -3495,33 +3476,23 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( // ldp x22, x21, [sp, #0] // addImm(+0) // Note: see comment in spillCalleeSavedRegisters() unsigned LdrOpc; - unsigned Size; - Align Alignment; + unsigned Size = TRI->getSpillSize(*RPI.RC); + Align Alignment = TRI->getSpillAlign(*RPI.RC); switch (RPI.Type) { case RegPairInfo::GPR: LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR64: LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui; - Size = 8; - Alignment = Align(8); break; case RegPairInfo::FPR128: LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::ZPR: LdrOpc = RPI.isPaired() ? 
AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI; - Size = 16; - Alignment = Align(16); break; case RegPairInfo::PPR: LdrOpc = AArch64::LDR_PXI; - Size = 2; - Alignment = Align(2); break; case RegPairInfo::VG: continue; @@ -3795,14 +3766,15 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, unsigned CSStackSize = 0; unsigned SVECSStackSize = 0; const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - const MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned Reg : SavedRegs.set_bits()) { - auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8; + auto *RC = TRI->getMinimalPhysRegClass(Reg); + assert(RC && "expected register class!"); + auto SpillSize = TRI->getSpillSize(*RC); if (AArch64::PPRRegClass.contains(Reg) || AArch64::ZPRRegClass.contains(Reg)) - SVECSStackSize += RegSize; + SVECSStackSize += SpillSize; else - CSStackSize += RegSize; + CSStackSize += SpillSize; } // Increase the callee-saved stack size if the function has streaming mode diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d4a114c275fb7..7d3ca46204b67 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2373,8 +2373,9 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant( return false; unsigned Size = VT.getSizeInBits(); - assert((Size == 32 || Size == 64) && - "i32 or i64 is expected after legalization."); + + if (Size != 32 && Size != 64) + return false; // Exit early if we demand all bits. if (DemandedBits.popcount() == Size) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 932a6f9ce23fd..7f10bfed739b4 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" +#include "llvm/TargetParser/AArch64TargetParser.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include @@ -248,6 +249,19 @@ static bool hasPossibleIncompatibleOps(const Function *F) { return false; } +uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const { + StringRef AttributeStr = + isMultiversionedFunction(F) ? 
"fmv-features" : "target-features"; + StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString(); + SmallVector Features; + FeatureStr.split(Features, ","); + return AArch64::getFMVPriority(Features); +} + +bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const { + return F.hasFnAttribute("fmv-features"); +} + bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 8e7e590c173ff..1eb805ae00b1b 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -89,6 +89,10 @@ class AArch64TTIImpl : public BasicTTIImplBase { unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const; + uint64_t getFeatureMask(const Function &F) const; + + bool isMultiversionedFunction(const Function &F) const; + /// \name Scalar TTI Implementations /// @{ diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 169f1369fb543..7de64bddf7884 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -715,7 +715,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()), MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()), - Mode(MFI.getMode()) { + Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()) { for (Register Reg : MFI.getSGPRSpillPhysVGPRs()) SpillPhysVGPRS.push_back(regToString(Reg, TRI)); diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp index efc8b77f8d8fa..420b98b8a9c1f 100644 --- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -1009,7 +1009,8 @@ void LoongArchAsmParser::emitLoadAddressPcrel(MCInst &Inst, SMLoc IDLoc, Insts.push_back( LoongArchAsmParser::Inst(ADDI, LoongArchMCExpr::VK_LoongArch_PCALA_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, true); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressPcrelLarge(MCInst &Inst, SMLoc IDLoc, @@ -1083,7 +1084,8 @@ void LoongArchAsmParser::emitLoadAddressGot(MCInst &Inst, SMLoc IDLoc, Insts.push_back( LoongArchAsmParser::Inst(LD, LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, true); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressGotLarge(MCInst &Inst, SMLoc IDLoc, @@ -1176,7 +1178,8 @@ void LoongArchAsmParser::emitLoadAddressTLSIE(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( LD, LoongArchMCExpr::VK_LoongArch_TLS_IE_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSIELarge(MCInst &Inst, SMLoc IDLoc, @@ -1248,7 +1251,8 @@ void LoongArchAsmParser::emitLoadAddressTLSLD(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( ADDI, 
LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSLDLarge(MCInst &Inst, SMLoc IDLoc, @@ -1320,7 +1324,8 @@ void LoongArchAsmParser::emitLoadAddressTLSGD(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( ADDI, LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSGDLarge(MCInst &Inst, SMLoc IDLoc, @@ -1409,7 +1414,8 @@ void LoongArchAsmParser::emitLoadAddressTLSDesc(MCInst &Inst, SMLoc IDLoc, Insts.push_back(LoongArchAsmParser::Inst( LoongArch::JIRL, LoongArchMCExpr::VK_LoongArch_TLS_DESC_CALL)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, + /*RelaxHint=*/true); } void LoongArchAsmParser::emitLoadAddressTLSDescLarge(MCInst &Inst, SMLoc IDLoc, @@ -1500,8 +1506,9 @@ void LoongArchAsmParser::emitFuncCall36(MCInst &Inst, SMLoc IDLoc, IsTailCall ? Inst.getOperand(0).getReg() : MCRegister(LoongArch::R1); const MCExpr *Sym = IsTailCall ? Inst.getOperand(1).getExpr() : Inst.getOperand(0).getExpr(); - const LoongArchMCExpr *LE = LoongArchMCExpr::create( - Sym, llvm::LoongArchMCExpr::VK_LoongArch_CALL36, getContext()); + const LoongArchMCExpr *LE = + LoongArchMCExpr::create(Sym, llvm::LoongArchMCExpr::VK_LoongArch_CALL36, + getContext(), /*RelaxHint=*/true); Out.emitInstruction( MCInstBuilder(LoongArch::PCADDU18I).addReg(ScratchReg).addExpr(LE), diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp index 359bde1244429..04d57f0fe7457 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp @@ -282,9 +282,11 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO, break; case LoongArchMCExpr::VK_LoongArch_TLS_LE_HI20_R: FixupKind = LoongArch::fixup_loongarch_tls_le_hi20_r; + RelaxCandidate = true; break; case LoongArchMCExpr::VK_LoongArch_TLS_LE_LO12_R: FixupKind = LoongArch::fixup_loongarch_tls_le_lo12_r; + RelaxCandidate = true; break; case LoongArchMCExpr::VK_LoongArch_PCREL20_S2: FixupKind = LoongArch::fixup_loongarch_pcrel20_s2; @@ -387,11 +389,17 @@ void LoongArchMCCodeEmitter::expandAddTPRel(const MCInst &MI, "Expected %le_add_r relocation on TP-relative symbol"); // Emit the correct %le_add_r relocation for the symbol. - // TODO: Emit R_LARCH_RELAX for %le_add_r where the relax feature is enabled. Fixups.push_back(MCFixup::create( 0, Expr, MCFixupKind(LoongArch::fixup_loongarch_tls_le_add_r), MI.getLoc())); + // Emit R_LARCH_RELAX for %le_add_r when the relax feature is enabled. + if (STI.hasFeature(LoongArch::FeatureRelax)) { + const MCConstantExpr *Dummy = MCConstantExpr::create(0, Ctx); + Fixups.push_back(MCFixup::create( + 0, Dummy, MCFixupKind(LoongArch::fixup_loongarch_relax), MI.getLoc())); + } + // Emit a normal ADD instruction with the given operands. unsigned ADD = MI.getOpcode() == LoongArch::PseudoAddTPRel_D ? 
LoongArch::ADD_D diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index fc8a0eaed140d..7fbba7f05e0a5 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -578,10 +578,10 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, unsigned Opc; switch (Opcode) { case X86::PTILELOADDRSV: - Opc = X86::TILELOADDRS; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS); break; case X86::PTILELOADDRST1V: - Opc = X86::TILELOADDRST1; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1); break; case X86::PTILELOADDV: Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD); @@ -737,28 +737,28 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, unsigned Opc; switch (Opcode) { case X86::PT2RPNTLVWZ0V: - Opc = X86::T2RPNTLVWZ0; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0); break; case X86::PT2RPNTLVWZ0T1V: - Opc = X86::T2RPNTLVWZ0T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1); break; case X86::PT2RPNTLVWZ1V: - Opc = X86::T2RPNTLVWZ1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1); break; case X86::PT2RPNTLVWZ1T1V: - Opc = X86::T2RPNTLVWZ1T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1); break; case X86::PT2RPNTLVWZ0RSV: - Opc = X86::T2RPNTLVWZ0RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS); break; case X86::PT2RPNTLVWZ0RST1V: - Opc = X86::T2RPNTLVWZ0RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1); break; case X86::PT2RPNTLVWZ1RSV: - Opc = X86::T2RPNTLVWZ1RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS); break; case X86::PT2RPNTLVWZ1RST1V: - Opc = X86::T2RPNTLVWZ1RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1); break; default: llvm_unreachable("Impossible Opcode!"); diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 9b340a778b36a..84bcdae520885 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1800,10 +1800,10 @@ void X86DAGToDAGISel::emitFunctionEntryCode() { emitSpecialCodeForMain(); } -static bool isDispSafeForFrameIndex(int64_t Val) { - // On 64-bit platforms, we can run into an issue where a frame index +static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) { + // We can run into an issue where a frame index or a register base // includes a displacement that, when added to the explicit displacement, - // will overflow the displacement field. Assuming that the frame index + // will overflow the displacement field. Assuming that the // displacement fits into a 31-bit integer (which is only slightly more // aggressive than the current fundamental assumption that it fits into // a 32-bit integer), a 31-bit disp should always be safe. @@ -1831,7 +1831,7 @@ bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, // In addition to the checks required for a register base, check that // we do not try to use an unsafe Disp with a frame index. if (AM.BaseType == X86ISelAddressMode::FrameIndexBase && - !isDispSafeForFrameIndex(Val)) + !isDispSafeForFrameIndexOrRegBase(Val)) return true; // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to // 64 bits. Instructions with 32-bit register addresses perform this zero @@ -1849,10 +1849,14 @@ bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, // to get an address size override to be emitted. However, this // pseudo-register is not part of any register class and therefore causes // MIR verification to fail. 
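
The 31-bit bound referenced in the comment above is worth spelling out: two values that each fit in a signed 31-bit integer always sum to something representable in the signed 32-bit displacement field, so bounding each contribution at 31 bits is sufficient. A minimal sketch of the check the renamed helper is assumed to perform (its body is outside this hunk, so treat this as an illustration rather than the patch's code):

    // Sketch only: a displacement is considered safe for a frame-index or
    // register base when it fits in a signed 31-bit integer; adding one more
    // 31-bit-safe contribution cannot overflow the signed 32-bit field.
    static bool isDispSafeForFrameIndexOrRegBase(int64_t Val) {
      return isInt<31>(Val); // isInt<N> from llvm/Support/MathExtras.h
    }
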
- if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) && + if (Subtarget->isTarget64BitILP32() && + !isDispSafeForFrameIndexOrRegBase((uint32_t)Val) && !AM.hasBaseOrIndexReg()) return true; - } + } else if (AM.hasBaseOrIndexReg() && !isDispSafeForFrameIndexOrRegBase(Val)) + // For 32-bit X86, make sure the displacement still isn't close to the + // expressible limit. + return true; AM.Disp = Val; return false; } @@ -2553,7 +2557,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, case ISD::FrameIndex: if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == nullptr && - (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { + (!Subtarget->is64Bit() || isDispSafeForFrameIndexOrRegBase(AM.Disp))) { AM.BaseType = X86ISelAddressMode::FrameIndexBase; AM.Base_FrameIndex = cast(N)->getIndex(); return false; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 90e3e15b1fb46..dba38f3e1a0bc 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -26438,7 +26438,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (CC) { case ISD::SETEQ: { SetCC = getSETCC(X86::COND_E, Comi, dl, DAG); - if (HasAVX10_2_COMX & HasAVX10_2_COMX_Ty) // ZF == 1 + if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1 break; // (ZF = 1 and PF = 0) SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG); @@ -26447,7 +26447,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case ISD::SETNE: { SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG); - if (HasAVX10_2_COMX & HasAVX10_2_COMX_Ty) // ZF == 0 + if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0 break; // (ZF = 0 or PF = 1) SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG); @@ -37800,14 +37800,14 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTILESTORED: Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED); break; -#undef GET_EGPR_IF_ENABLED case X86::PTILELOADDRS: - Opc = X86::TILELOADDRS; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS); break; case X86::PTILELOADDRST1: - Opc = X86::TILELOADDRST1; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1); break; } +#undef GET_EGPR_IF_ENABLED MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); unsigned CurOp = 0; @@ -37838,34 +37838,36 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PT2RPNTLVWZ1RST1: { const DebugLoc &DL = MI.getDebugLoc(); unsigned Opc; +#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? 
OPC##_EVEX : OPC) switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instruction!"); case X86::PT2RPNTLVWZ0: - Opc = X86::T2RPNTLVWZ0; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0); break; case X86::PT2RPNTLVWZ0T1: - Opc = X86::T2RPNTLVWZ0T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1); break; case X86::PT2RPNTLVWZ1: - Opc = X86::T2RPNTLVWZ1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1); break; case X86::PT2RPNTLVWZ1T1: - Opc = X86::T2RPNTLVWZ1T1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1); break; case X86::PT2RPNTLVWZ0RS: - Opc = X86::T2RPNTLVWZ0RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS); break; case X86::PT2RPNTLVWZ0RST1: - Opc = X86::T2RPNTLVWZ0RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1); break; case X86::PT2RPNTLVWZ1RS: - Opc = X86::T2RPNTLVWZ1RS; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS); break; case X86::PT2RPNTLVWZ1RST1: - Opc = X86::T2RPNTLVWZ1RST1; + Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1); break; } +#undef GET_EGPR_IF_ENABLED MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define); @@ -58570,8 +58572,8 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); SDLoc DL(N); @@ -59264,7 +59266,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, // clang-format off default: break; case ISD::SCALAR_TO_VECTOR: - return combineScalarToVector(N, DAG, Subtarget); + return combineSCALAR_TO_VECTOR(N, DAG, Subtarget); case ISD::EXTRACT_VECTOR_ELT: case X86ISD::PEXTRW: case X86ISD::PEXTRB: diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index a055ba91d3e17..85046228bc8c5 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -345,26 +345,33 @@ let Predicates = [HasAMXTILE, In64BitMode], isPseudo = true, SchedRW = [WriteSys def PTILEPAIRLOAD : PseudoI<(outs TILEPair:$dst), (ins opaquemem:$src), []>; } -let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { - let SchedRW = [WriteSystem] in { - def T2RPNTLVWZ0 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz0\t{$src, $dst|$dst, $src}", - []>, VEX, WIG, T8,PS; +multiclass T2RPNTLVW_Base op1, bits<8> op2, string rs, string suffix> { + def Z0#rs#suffix : I, PS; + def Z0#rs#T1#suffix : I, PS; + def Z1#rs#suffix : I, PD; + def Z1#rs#T1#suffix : I, PD; +} - def T2RPNTLVWZ0T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz0t1\t{$src, $dst|$dst, $src}", - []>, VEX, T8,PS; +let Predicates = [HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in + defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "">, T8, VEX; - def T2RPNTLVWZ1 : I<0x6e, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz1\t{$src, $dst|$dst, $src}", - []>, VEX, T8,PD; +let Predicates = [HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in + defm T2RPNTLVW : T2RPNTLVW_Base<0x6e, 0x6f, "", "_EVEX">, T8, EVEX, NoCD8; - def T2RPNTLVWZ1T1 : I<0x6f, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src), "t2rpntlvwz1t1\t{$src, $dst|$dst, $src}", - []>, VEX, T8,PD; +let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in + defm T2RPNTLVW : 
T2RPNTLVW_Base<0xf8, 0xf9, "RS", "">, T_MAP5, VEX; +let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in + defm T2RPNTLVW : T2RPNTLVW_Base<0xf8, 0xf9, "RS", "_EVEX">, T_MAP5, EVEX, NoCD8; + +let Predicates = [HasAMXTRANSPOSE, In64BitMode] in { + let SchedRW = [WriteSystem] in { def TTRANSPOSED : I<0x5f, MRMSrcReg, (outs TILE:$dst), (ins TILE:$src), - "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8,XS; + "ttransposed\t{$src, $dst|$dst, $src}", []>, VEX, T8, XS; let isPseudo = true in { def PT2RPNTLVWZ0V : PseudoI<(outs TILEPair:$dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), @@ -491,22 +498,6 @@ let Predicates = [HasAMXCOMPLEX, HasAMXTRANSPOSE, In64BitMode], SchedRW = [Write } let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSystem] in { - def T2RPNTLVWZ0RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz0rs\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5; - def T2RPNTLVWZ0RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz0rst1\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5; - def T2RPNTLVWZ1RS : I<0xf8, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz1rs\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5, PD; - def T2RPNTLVWZ1RST1 : I<0xf9, MRMSrcMemFSIB, (outs TILEPair:$dst), - (ins sibmem:$src1), - "t2rpntlvwz1rst1\t{$src1, $dst|$dst, $src1}", - []>, VEX, T_MAP5, PD; let isPseudo = true in { def PT2RPNTLVWZ0RSV : PseudoI<(outs TILEPair:$dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, opaquemem:$src4), @@ -529,16 +520,20 @@ let Predicates = [HasAMXMOVRS, HasAMXTRANSPOSE, In64BitMode], SchedRW = [WriteSy } } // HasAMXMOVRS, HasAMXTRANSPOSE -let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in { - def TILELOADDRS : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), - (ins sibmem:$src1), - "tileloaddrs\t{$src1, $dst|$dst, $src1}", - []>, VEX, T8, XD; - def TILELOADDRST1 : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), - (ins sibmem:$src1), - "tileloaddrst1\t{$src1, $dst|$dst, $src1}", - []>, VEX, T8, PD; +multiclass TILELOADDRS_Base { + def suffix : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src1), + "tileloaddrs\t{$src1, $dst|$dst, $src1}", []>, T8, XD; + def T1#suffix : I<0x4a, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src1), + "tileloaddrst1\t{$src1, $dst|$dst, $src1}", []>, T8, PD; +} + +let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in + defm TILELOADDRS : TILELOADDRS_Base<"">, VEX; +let Predicates = [HasAMXMOVRS, HasEGPR, In64BitMode], SchedRW = [WriteSystem] in + defm TILELOADDRS : TILELOADDRS_Base<"_EVEX">, EVEX, NoCD8; + +let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in { let isPseudo = true, mayLoad = 1 in { def PTILELOADDRSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td index 127016184bc17..edbcb17297603 100644 --- a/llvm/lib/Target/X86/X86InstrAVX10.td +++ b/llvm/lib/Target/X86/X86InstrAVX10.td @@ -1767,9 +1767,9 @@ multiclass vmovrs_p opc, string OpStr, X86VectorVTInfo _> { } multiclass vmovrs_p_vl opc, string OpStr, AVX512VLVectorVTInfo _Vec> { - let Predicates = [HasMOVRS, HasAVX10_2_512] in + let Predicates = [HasMOVRS, HasAVX10_2_512, In64BitMode] in defm Z : vmovrs_p, EVEX_V512; - let Predicates = [HasMOVRS, HasAVX10_2] in { + let Predicates = [HasMOVRS, HasAVX10_2, In64BitMode] in { defm Z128 : vmovrs_p, EVEX_V128; defm Z256 : 
vmovrs_p, EVEX_V256; } diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td index 9fabe2acf0019..43c02c4f85844 100644 --- a/llvm/lib/Target/X86/X86InstrMisc.td +++ b/llvm/lib/Target/X86/X86InstrMisc.td @@ -1733,7 +1733,7 @@ def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src", // let SchedRW = [WriteLoad] in { -let Predicates = [HasMOVRS, NoEGPR] in { +let Predicates = [HasMOVRS, NoEGPR, In64BitMode] in { def MOVRS8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), "movrs{b}\t{$src, $dst|$dst, $src}", [(set GR8:$dst, (int_x86_movrsqi addr:$src))]>, T8; @@ -1746,8 +1746,25 @@ def MOVRS32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), def MOVRS64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "movrs{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (int_x86_movrsdi addr:$src))]>, T8; +} + +let Predicates = [HasMOVRS] in def PREFETCHRST2 : I<0x18, MRM4m, (outs), (ins i8mem:$src), "prefetchrst2\t$src", [(int_x86_prefetchrs addr:$src)]>, TB; + +let Predicates = [HasMOVRS, HasEGPR, In64BitMode] in { +def MOVRS8rm_EVEX : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), + "movrs{b}\t{$src, $dst|$dst, $src}", + [(set GR8:$dst, (int_x86_movrsqi addr:$src))]>, EVEX, NoCD8, T_MAP4; +def MOVRS16rm_EVEX : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "movrs{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (int_x86_movrshi addr:$src))]>, EVEX, NoCD8, PD, T_MAP4; +def MOVRS32rm_EVEX : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "movrs{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_movrssi addr:$src))]>, EVEX, NoCD8, T_MAP4; +def MOVRS64rm_EVEX : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "movrs{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (int_x86_movrsdi addr:$src))]>, EVEX, NoCD8, T_MAP4, REX_W; +} } -} \ No newline at end of file diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp index 34ca03a47e0a4..e13c6e6d28c2b 100644 --- a/llvm/lib/TargetParser/AArch64TargetParser.cpp +++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp @@ -48,12 +48,33 @@ std::optional AArch64::ArchInfo::findBySubArch(StringRef SubA return {}; } +std::optional lookupFMVByID(AArch64::ArchExtKind ExtID) { + for (const AArch64::FMVInfo &Info : AArch64::getFMVInfo()) + if (Info.ID && *Info.ID == ExtID) + return Info; + return {}; +} + uint64_t AArch64::getFMVPriority(ArrayRef Features) { - uint64_t Priority = 0; - for (StringRef Feature : Features) - if (std::optional Info = parseFMVExtension(Feature)) - Priority |= (1ULL << Info->PriorityBit); - return Priority; + // Transitively enable the Arch Extensions which correspond to each feature. + ExtensionSet FeatureBits; + for (const StringRef Feature : Features) { + std::optional FMV = parseFMVExtension(Feature); + if (!FMV) { + if (std::optional Info = targetFeatureToExtension(Feature)) + FMV = lookupFMVByID(Info->ID); + } + if (FMV && FMV->ID) + FeatureBits.enable(*FMV->ID); + } + + // Construct a bitmask for all the transitively enabled Arch Extensions. 
+ uint64_t PriorityMask = 0; + for (const FMVInfo &Info : getFMVInfo()) + if (Info.ID && FeatureBits.Enabled.test(*Info.ID)) + PriorityMask |= (1ULL << Info.PriorityBit); + + return PriorityMask; } uint64_t AArch64::getCpuSupportsMask(ArrayRef Features) { diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 78cd249c9c16a..bf0cacc6224be 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2641,6 +2641,165 @@ DeleteDeadIFuncs(Module &M, return Changed; } +// Follows the use-def chain of \p V backwards until it finds a Function, +// in which case it collects in \p Versions. Return true on successful +// use-def chain traversal, false otherwise. +static bool collectVersions(TargetTransformInfo &TTI, Value *V, + SmallVectorImpl &Versions) { + if (auto *F = dyn_cast(V)) { + if (!TTI.isMultiversionedFunction(*F)) + return false; + Versions.push_back(F); + } else if (auto *Sel = dyn_cast(V)) { + if (!collectVersions(TTI, Sel->getTrueValue(), Versions)) + return false; + if (!collectVersions(TTI, Sel->getFalseValue(), Versions)) + return false; + } else if (auto *Phi = dyn_cast(V)) { + for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) + if (!collectVersions(TTI, Phi->getIncomingValue(I), Versions)) + return false; + } else { + // Unknown instruction type. Bail. + return false; + } + return true; +} + +// Bypass the IFunc Resolver of MultiVersioned functions when possible. To +// deduce whether the optimization is legal we need to compare the target +// features between caller and callee versions. The criteria for bypassing +// the resolver are the following: +// +// * If the callee's feature set is a subset of the caller's feature set, +// then the callee is a candidate for direct call. +// +// * Among such candidates the one of highest priority is the best match +// and it shall be picked, unless there is a version of the callee with +// higher priority than the best match which cannot be picked from a +// higher priority caller (directly or through the resolver). +// +// * For every higher priority callee version than the best match, there +// is a higher priority caller version whose feature set availability +// is implied by the callee's feature set. +// +static bool OptimizeNonTrivialIFuncs( + Module &M, function_ref GetTTI) { + bool Changed = false; + + // Cache containing the mask constructed from a function's target features. + DenseMap FeatureMask; + + for (GlobalIFunc &IF : M.ifuncs()) { + if (IF.isInterposable()) + continue; + + Function *Resolver = IF.getResolverFunction(); + if (!Resolver) + continue; + + if (Resolver->isInterposable()) + continue; + + TargetTransformInfo &TTI = GetTTI(*Resolver); + + // Discover the callee versions. + SmallVector Callees; + if (any_of(*Resolver, [&TTI, &Callees](BasicBlock &BB) { + if (auto *Ret = dyn_cast_or_null(BB.getTerminator())) + if (!collectVersions(TTI, Ret->getReturnValue(), Callees)) + return true; + return false; + })) + continue; + + assert(!Callees.empty() && "Expecting successful collection of versions"); + + // Cache the feature mask for each callee. + for (Function *Callee : Callees) { + auto [It, Inserted] = FeatureMask.try_emplace(Callee); + if (Inserted) + It->second = TTI.getFeatureMask(*Callee); + } + + // Sort the callee versions in decreasing priority order. + sort(Callees, [&](auto *LHS, auto *RHS) { + return FeatureMask[LHS] > FeatureMask[RHS]; + }); + + // Find the callsites and cache the feature mask for each caller. 
+ SmallVector Callers; + DenseMap> CallSites; + for (User *U : IF.users()) { + if (auto *CB = dyn_cast(U)) { + if (CB->getCalledOperand() == &IF) { + Function *Caller = CB->getFunction(); + auto [FeatIt, FeatInserted] = FeatureMask.try_emplace(Caller); + if (FeatInserted) + FeatIt->second = TTI.getFeatureMask(*Caller); + auto [CallIt, CallInserted] = CallSites.try_emplace(Caller); + if (CallInserted) + Callers.push_back(Caller); + CallIt->second.push_back(CB); + } + } + } + + // Sort the caller versions in decreasing priority order. + sort(Callers, [&](auto *LHS, auto *RHS) { + return FeatureMask[LHS] > FeatureMask[RHS]; + }); + + auto implies = [](uint64_t A, uint64_t B) { return (A & B) == B; }; + + // Index to the highest priority candidate. + unsigned I = 0; + // Now try to redirect calls starting from higher priority callers. + for (Function *Caller : Callers) { + assert(I < Callees.size() && "Found callers of equal priority"); + + Function *Callee = Callees[I]; + uint64_t CallerBits = FeatureMask[Caller]; + uint64_t CalleeBits = FeatureMask[Callee]; + + // In the case of FMV callers, we know that all higher priority callers + // than the current one did not get selected at runtime, which helps + // reason about the callees (if they have versions that mandate presence + // of the features which we already know are unavailable on this target). + if (TTI.isMultiversionedFunction(*Caller)) { + // If the feature set of the caller implies the feature set of the + // highest priority candidate then it shall be picked. In case of + // identical sets advance the candidate index one position. + if (CallerBits == CalleeBits) + ++I; + else if (!implies(CallerBits, CalleeBits)) { + // Keep advancing the candidate index as long as the caller's + // features are a subset of the current candidate's. + while (implies(CalleeBits, CallerBits)) { + if (++I == Callees.size()) + break; + CalleeBits = FeatureMask[Callees[I]]; + } + continue; + } + } else { + // We can't reason much about non-FMV callers. Just pick the highest + // priority callee if it matches, otherwise bail. + if (I > 0 || !implies(CallerBits, CalleeBits)) + continue; + } + auto &Calls = CallSites[Caller]; + for (CallBase *CS : Calls) + CS->setCalledOperand(Callee); + Changed = true; + } + if (IF.use_empty() || + all_of(IF.users(), [](User *U) { return isa(U); })) + NumIFuncsResolved++; + } + return Changed; +} + static bool optimizeGlobalsInModule(Module &M, const DataLayout &DL, function_ref GetTLI, @@ -2707,6 +2866,9 @@ optimizeGlobalsInModule(Module &M, const DataLayout &DL, // Optimize IFuncs whose callee's are statically known. LocalChange |= OptimizeStaticIFuncs(M); + // Optimize IFuncs based on the target features of the caller. + LocalChange |= OptimizeNonTrivialIFuncs(M, GetTTI); + // Remove any IFuncs that are now dead. 
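
To make the resolver-bypass criteria above concrete, here is a hypothetical source-level example (names and attributes are illustrative, not taken from the patch). With callee versions foo._Msve2, foo._Msve and foo.default, a caller that is itself the sve2 version of a multiversioned function may call foo._Msve2 directly, because any runtime where that caller was selected also satisfies the highest-priority callee; the default caller keeps going through the ifunc resolver, since sve might still be available even when sve2 is not.

    // Hypothetical AArch64 FMV example of what OptimizeNonTrivialIFuncs enables.
    __attribute__((target_clones("sve2", "sve", "default")))
    int foo(void) { return 1; }

    __attribute__((target_version("sve2"))) int bar(void) {
      return foo(); // may be redirected to call foo._Msve2 directly
    }

    __attribute__((target_version("default"))) int bar(void) {
      return foo(); // still dispatched through the ifunc resolver
    }
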
LocalChange |= DeleteDeadIFuncs(M, NotDiscardableComdats); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index d0b2ded127ff7..df5f9833a2ff9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -13,6 +13,7 @@ #include "InstCombineInternal.h" #include "llvm/ADT/APInt.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueTracking.h" @@ -657,6 +658,94 @@ Instruction *InstCombinerImpl::foldPowiReassoc(BinaryOperator &I) { return nullptr; } +// If we have the following pattern, +// X = 1.0/sqrt(a) +// R1 = X * X +// R2 = a/sqrt(a) +// then this method collects all the instructions that match R1 and R2. +static bool getFSqrtDivOptPattern(Instruction *Div, + SmallPtrSetImpl &R1, + SmallPtrSetImpl &R2) { + Value *A; + if (match(Div, m_FDiv(m_FPOne(), m_Sqrt(m_Value(A)))) || + match(Div, m_FDiv(m_SpecificFP(-1.0), m_Sqrt(m_Value(A))))) { + for (User *U : Div->users()) { + Instruction *I = cast(U); + if (match(I, m_FMul(m_Specific(Div), m_Specific(Div)))) + R1.insert(I); + } + + CallInst *CI = cast(Div->getOperand(1)); + for (User *U : CI->users()) { + Instruction *I = cast(U); + if (match(I, m_FDiv(m_Specific(A), m_Sqrt(m_Specific(A))))) + R2.insert(I); + } + } + return !R1.empty() && !R2.empty(); +} + +// Check legality for transforming +// x = 1.0/sqrt(a) +// r1 = x * x; +// r2 = a/sqrt(a); +// +// TO +// +// r1 = 1/a +// r2 = sqrt(a) +// x = r1 * r2 +// This transform works only when 'a' is known positive. +static bool isFSqrtDivToFMulLegal(Instruction *X, + SmallPtrSetImpl &R1, + SmallPtrSetImpl &R2) { + // Check if the required pattern for the transformation exists. + if (!getFSqrtDivOptPattern(X, R1, R2)) + return false; + + BasicBlock *BBx = X->getParent(); + BasicBlock *BBr1 = (*R1.begin())->getParent(); + BasicBlock *BBr2 = (*R2.begin())->getParent(); + + CallInst *FSqrt = cast(X->getOperand(1)); + if (!FSqrt->hasAllowReassoc() || !FSqrt->hasNoNaNs() || + !FSqrt->hasNoSignedZeros() || !FSqrt->hasNoInfs()) + return false; + + // We change x = 1/sqrt(a) to x = sqrt(a) * 1/a . This change isn't allowed + // by recip fp as it is strictly meant to transform ops of type a/b to + // a * 1/b. So, this can be considered as algebraic rewrite and reassoc flag + // has been used(rather abused)in the past for algebraic rewrites. + if (!X->hasAllowReassoc() || !X->hasAllowReciprocal() || !X->hasNoInfs()) + return false; + + // Check the constraints on X, R1 and R2 combined. + // fdiv instruction and one of the multiplications must reside in the same + // block. If not, the optimized code may execute more ops than before and + // this may hamper the performance. + if (BBx != BBr1 && BBx != BBr2) + return false; + + // Check the constraints on instructions in R1. + if (any_of(R1, [BBr1](Instruction *I) { + // When you have multiple instructions residing in R1 and R2 + // respectively, it's difficult to generate combinations of (R1,R2) and + // then check if we have the required pattern. So, for now, just be + // conservative. + return (I->getParent() != BBr1 || !I->hasAllowReassoc()); + })) + return false; + + // Check the constraints on instructions in R2. 
+ return all_of(R2, [BBr2](Instruction *I) { + // When you have multiple instructions residing in R1 and R2 + // respectively, it's difficult to generate combination of (R1,R2) and + // then check if we have the required pattern. So, for now, just be + // conservative. + return (I->getParent() == BBr2 && I->hasAllowReassoc()); + }); +} + Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) { Value *Op0 = I.getOperand(0); Value *Op1 = I.getOperand(1); @@ -1913,6 +2002,74 @@ static Instruction *foldFDivSqrtDivisor(BinaryOperator &I, return BinaryOperator::CreateFMulFMF(Op0, NewSqrt, &I); } +// Change +// X = 1/sqrt(a) +// R1 = X * X +// R2 = a * X +// +// TO +// +// FDiv = 1/a +// FSqrt = sqrt(a) +// FMul = FDiv * FSqrt +// Replace Uses Of R1 With FDiv +// Replace Uses Of R2 With FSqrt +// Replace Uses Of X With FMul +static Instruction * +convertFSqrtDivIntoFMul(CallInst *CI, Instruction *X, + const SmallPtrSetImpl &R1, + const SmallPtrSetImpl &R2, + InstCombiner::BuilderTy &B, InstCombinerImpl *IC) { + + B.SetInsertPoint(X); + + // Have an instruction that is representative of all of instructions in R1 and + // get the most common fpmath metadata and fast-math flags on it. + Value *SqrtOp = CI->getArgOperand(0); + auto *FDiv = cast( + B.CreateFDiv(ConstantFP::get(X->getType(), 1.0), SqrtOp)); + auto *R1FPMathMDNode = (*R1.begin())->getMetadata(LLVMContext::MD_fpmath); + FastMathFlags R1FMF = (*R1.begin())->getFastMathFlags(); // Common FMF + for (Instruction *I : R1) { + R1FPMathMDNode = MDNode::getMostGenericFPMath( + R1FPMathMDNode, I->getMetadata(LLVMContext::MD_fpmath)); + R1FMF &= I->getFastMathFlags(); + IC->replaceInstUsesWith(*I, FDiv); + IC->eraseInstFromFunction(*I); + } + FDiv->setMetadata(LLVMContext::MD_fpmath, R1FPMathMDNode); + FDiv->copyFastMathFlags(R1FMF); + + // Have a single sqrt call instruction that is representative of all of + // instructions in R2 and get the most common fpmath metadata and fast-math + // flags on it. 
+ auto *FSqrt = cast(CI->clone()); + FSqrt->insertBefore(CI); + auto *R2FPMathMDNode = (*R2.begin())->getMetadata(LLVMContext::MD_fpmath); + FastMathFlags R2FMF = (*R2.begin())->getFastMathFlags(); // Common FMF + for (Instruction *I : R2) { + R2FPMathMDNode = MDNode::getMostGenericFPMath( + R2FPMathMDNode, I->getMetadata(LLVMContext::MD_fpmath)); + R2FMF &= I->getFastMathFlags(); + IC->replaceInstUsesWith(*I, FSqrt); + IC->eraseInstFromFunction(*I); + } + FSqrt->setMetadata(LLVMContext::MD_fpmath, R2FPMathMDNode); + FSqrt->copyFastMathFlags(R2FMF); + + Instruction *FMul; + // If X = -1/sqrt(a) initially,then FMul = -(FDiv * FSqrt) + if (match(X, m_FDiv(m_SpecificFP(-1.0), m_Specific(CI)))) { + Value *Mul = B.CreateFMul(FDiv, FSqrt); + FMul = cast(B.CreateFNeg(Mul)); + } else + FMul = cast(B.CreateFMul(FDiv, FSqrt)); + FMul->copyMetadata(*X); + FMul->copyFastMathFlags(FastMathFlags::intersectRewrite(R1FMF, R2FMF) | + FastMathFlags::unionValue(R1FMF, R2FMF)); + return IC->replaceInstUsesWith(*X, FMul); +} + Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { Module *M = I.getModule(); @@ -1937,6 +2094,24 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) { return R; Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + // Convert + // x = 1.0/sqrt(a) + // r1 = x * x; + // r2 = a/sqrt(a); + // + // TO + // + // r1 = 1/a + // r2 = sqrt(a) + // x = r1 * r2 + SmallPtrSet R1, R2; + if (isFSqrtDivToFMulLegal(&I, R1, R2)) { + CallInst *CI = cast(I.getOperand(1)); + if (Instruction *D = convertFSqrtDivIntoFMul(CI, &I, R1, R2, Builder, this)) + return D; + } + if (isa(Op0)) if (SelectInst *SI = dyn_cast(Op1)) if (Instruction *R = FoldOpIntoSelect(I, SI)) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 2fb60ef11499c..fb21576722461 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -281,28 +281,33 @@ bool InstCombinerImpl::shouldChangeType(Type *From, Type *To) const { // Return true, if No Signed Wrap should be maintained for I. // The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C", // where both B and C should be ConstantInts, results in a constant that does -// not overflow. This function only handles the Add and Sub opcodes. For +// not overflow. This function only handles the Add/Sub/Mul opcodes. For // all other opcodes, the function conservatively returns false. static bool maintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { auto *OBO = dyn_cast(&I); if (!OBO || !OBO->hasNoSignedWrap()) return false; - // We reason about Add and Sub Only. - Instruction::BinaryOps Opcode = I.getOpcode(); - if (Opcode != Instruction::Add && Opcode != Instruction::Sub) - return false; - const APInt *BVal, *CVal; if (!match(B, m_APInt(BVal)) || !match(C, m_APInt(CVal))) return false; + // We reason about Add/Sub/Mul Only. bool Overflow = false; - if (Opcode == Instruction::Add) + switch (I.getOpcode()) { + case Instruction::Add: (void)BVal->sadd_ov(*CVal, Overflow); - else + break; + case Instruction::Sub: (void)BVal->ssub_ov(*CVal, Overflow); - + break; + case Instruction::Mul: + (void)BVal->smul_ov(*CVal, Overflow); + break; + default: + // Conservatively return false for other opcodes. 
+ return false; + } return !Overflow; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 99f6a8860f0f4..6df11abda9e98 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1447,9 +1447,11 @@ class LoopVectorizationCostModel { // Override forced styles if needed. // FIXME: use actual opcode/data type for analysis here. // FIXME: Investigate opportunity for fixed vector factor. - bool EVLIsLegal = UserIC <= 1 && - TTI.hasActiveVectorLength(0, nullptr, Align()) && - !EnableVPlanNativePath; + bool EVLIsLegal = + UserIC <= 1 && TTI.hasActiveVectorLength(0, nullptr, Align()) && + !EnableVPlanNativePath && + // FIXME: remove this once fixed-ordered recurrence is supported. + Legal->getFixedOrderRecurrences().empty(); if (!EVLIsLegal) { // If for some reason EVL mode is unsupported, fallback to // DataWithoutLaneMask to try to vectorize the loop with folded tail @@ -3503,10 +3505,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( if (hasIrregularType(ScalarTy, DL)) return false; - // We currently only know how to emit interleave/deinterleave with - // Factor=2 for scalable vectors. This is purely an implementation - // limit. - if (VF.isScalable() && InterleaveFactor != 2) + // For scalable vectors, the only interleave factor currently supported + // must be power of 2 since we require the (de)interleave2 intrinsics + // instead of shufflevectors. + if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor)) return false; // If the group involves a non-integral pointer, we may not be able to @@ -9433,9 +9435,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); // For scalable vectors, the only interleave factor currently supported - // is 2 since we require the (de)interleave2 intrinsics instead of - // shufflevectors. - assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && + // must be power of 2 since we require the (de)interleave2 intrinsics + // instead of shufflevectors. + assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) && "Unsupported interleave factor for scalable vectors"); return Result; }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 979a8e0768a99..5ae2f43e4950c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2863,10 +2863,21 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef Vals, // Scalable vectors cannot use arbitrary shufflevectors (only splats), so // must use intrinsics to interleave. if (VecTy->isScalableTy()) { - VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy); - return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2, - Vals, - /*FMFSource=*/nullptr, Name); + assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for " + "scalable vectors, must be power of 2"); + SmallVector InterleavingValues(Vals); + // When interleaving, the number of values will be shrunk until we have the + // single final interleaved value. 
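Editor's sketch, not part of the patch: the halving loop that follows pairs value I with value Midpoint + I and feeds each pair to llvm.vector.interleave2, so a factor-4 group is built as interleave2(interleave2(V0, V2), interleave2(V1, V3)), which yields the lane order V0[0], V1[0], V2[0], V3[0], V0[1], ... The stand-alone C++ model below (the toy interleave2 helper and the sample lane values are mine, not LLVM API) reproduces that pairing order and checks the resulting layout.

// Toy model of the recursive interleave built by the loop that follows.
#include <cassert>
#include <vector>
using Vec = std::vector<int>;

// Stands in for the llvm.vector.interleave2 intrinsic on small toy vectors.
static Vec interleave2(const Vec &A, const Vec &B) {
  Vec R;
  for (size_t I = 0; I < A.size(); ++I) {
    R.push_back(A[I]);
    R.push_back(B[I]);
  }
  return R;
}

int main() {
  // Member J of the group holds values 10*J + lane, e.g. member 2 = {20, 21, 22}.
  const unsigned Factor = 4;
  std::vector<Vec> Values;
  for (unsigned J = 0; J < Factor; ++J)
    Values.push_back({int(10 * J), int(10 * J + 1), int(10 * J + 2)});

  // Same halving loop as interleaveVectors: pair entry I with entry Midpoint + I.
  for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2)
    for (unsigned I = 0; I < Midpoint; ++I)
      Values[I] = interleave2(Values[I], Values[Midpoint + I]);

  // The final value cycles through member0..member3 for each lane in turn.
  assert((Values[0] == Vec{0, 10, 20, 30, 1, 11, 21, 31, 2, 12, 22, 32}));
  return 0;
}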
+ auto *InterleaveTy = cast(InterleavingValues[0]->getType()); + for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) { + InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy); + for (unsigned I = 0; I < Midpoint; ++I) + InterleavingValues[I] = Builder.CreateIntrinsic( + InterleaveTy, Intrinsic::vector_interleave2, + {InterleavingValues[I], InterleavingValues[Midpoint + I]}, + /*FMFSource=*/nullptr, Name); + } + return InterleavingValues[0]; } // Fixed length. Start by concatenating all vectors into a wide vector. @@ -2952,15 +2963,11 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { &InterleaveFactor](Value *MaskForGaps) -> Value * { if (State.VF.isScalable()) { assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); - assert(InterleaveFactor == 2 && + assert(isPowerOf2_32(InterleaveFactor) && "Unsupported deinterleave factor for scalable vectors"); auto *ResBlockInMask = State.get(BlockInMask); - SmallVector Ops = {ResBlockInMask, ResBlockInMask}; - auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(), - State.VF.getKnownMinValue() * 2, true); - return State.Builder.CreateIntrinsic( - MaskTy, Intrinsic::vector_interleave2, Ops, - /*FMFSource=*/nullptr, "interleaved.mask"); + SmallVector Ops(InterleaveFactor, ResBlockInMask); + return interleaveVectors(State.Builder, Ops, "interleaved.mask"); } if (!BlockInMask) @@ -3000,22 +3007,48 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { ArrayRef VPDefs = definedValues(); const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); if (VecTy->isScalableTy()) { - assert(InterleaveFactor == 2 && + assert(isPowerOf2_32(InterleaveFactor) && "Unsupported deinterleave factor for scalable vectors"); - // Scalable vectors cannot use arbitrary shufflevectors (only splats), - // so must use intrinsics to deinterleave. - Value *DI = State.Builder.CreateIntrinsic( - Intrinsic::vector_deinterleave2, VecTy, NewLoad, - /*FMFSource=*/nullptr, "strided.vec"); - unsigned J = 0; - for (unsigned I = 0; I < InterleaveFactor; ++I) { - Instruction *Member = Group->getMember(I); + // Scalable vectors cannot use arbitrary shufflevectors (only splats), + // so must use intrinsics to deinterleave. + SmallVector DeinterleavedValues(InterleaveFactor); + DeinterleavedValues[0] = NewLoad; + // For the case of InterleaveFactor > 2, we will have to do recursive + // deinterleaving, because the current available deinterleave intrinsic + // supports only Factor of 2, otherwise it will bailout after first + // iteration. + // When deinterleaving, the number of values will double until we + // have "InterleaveFactor". 
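Editor's sketch, not part of the patch: the doubling loop that follows repeatedly applies llvm.vector.deinterleave2 and stores the two extracted halves at DeinterleavedValues[NumVectors * I + J], which after log2(Factor) rounds leaves member J of the interleave group in slot J. The stand-alone C++ model below (the toy deinterleave2 helper and the sample wide vector are mine, not LLVM API) traces that bookkeeping for a factor-4 group.

// Toy model of the recursive deinterleave performed by the loop that follows.
#include <cassert>
#include <utility>
#include <vector>
using Vec = std::vector<int>;

// Stands in for llvm.vector.deinterleave2: splits a vector into even and odd lanes.
static std::pair<Vec, Vec> deinterleave2(const Vec &V) {
  Vec Even, Odd;
  for (size_t I = 0; I < V.size(); ++I)
    (I % 2 ? Odd : Even).push_back(V[I]);
  return {Even, Odd};
}

int main() {
  const unsigned Factor = 4;
  // Wide load of a factor-4 group with three lanes: member J contributes
  // values 10*J + lane, members laid out consecutively within each lane.
  Vec Wide = {0, 10, 20, 30, 1, 11, 21, 31, 2, 12, 22, 32};

  std::vector<Vec> Deinterleaved(Factor);
  Deinterleaved[0] = Wide;
  // Same doubling loop and index bookkeeping as in the recipe.
  for (unsigned NumVectors = 1; NumVectors < Factor; NumVectors *= 2) {
    std::vector<std::pair<Vec, Vec>> Temp(NumVectors);
    for (unsigned I = 0; I < NumVectors; ++I)
      Temp[I] = deinterleave2(Deinterleaved[I]);
    for (unsigned I = 0; I < 2; ++I)
      for (unsigned J = 0; J < NumVectors; ++J)
        Deinterleaved[NumVectors * I + J] = I ? Temp[J].second : Temp[J].first;
  }

  // Slot J now holds exactly member J of the original group.
  for (unsigned J = 0; J < Factor; ++J)
    assert((Deinterleaved[J] ==
            Vec{int(10 * J), int(10 * J + 1), int(10 * J + 2)}));
  return 0;
}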
+ for (unsigned NumVectors = 1; NumVectors < InterleaveFactor; + NumVectors *= 2) { + // Deinterleave the elements within the vector + SmallVector TempDeinterleavedValues(NumVectors); + for (unsigned I = 0; I < NumVectors; ++I) { + auto *DiTy = DeinterleavedValues[I]->getType(); + TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic( + Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I], + /*FMFSource=*/nullptr, "strided.vec"); + } + // Extract the deinterleaved values: + for (unsigned I = 0; I < 2; ++I) + for (unsigned J = 0; J < NumVectors; ++J) + DeinterleavedValues[NumVectors * I + J] = + State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I); + } - if (!Member) +#ifndef NDEBUG + for (Value *Val : DeinterleavedValues) + assert(Val && "NULL Deinterleaved Value"); +#endif + for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) { + Instruction *Member = Group->getMember(I); + Value *StridedVec = DeinterleavedValues[I]; + if (!Member) { + // This value is not needed as it's not used + cast(StridedVec)->eraseFromParent(); continue; - - Value *StridedVec = State.Builder.CreateExtractValue(DI, I); + } // If this member has different type, cast the result type. if (Member->getType() != ScalarTy) { VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); diff --git a/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll b/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll new file mode 100644 index 0000000000000..7ab6221d0da53 --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/implied-via-addition.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -disable-output -passes="print" \ +; RUN: -scalar-evolution-classify-expressions=0 2>&1 | FileCheck %s + +define void @implied1(i32 %n) { +; Prove that (n s> 1) ===> (n - 1 s> 0). +; CHECK-LABEL: 'implied1' +; CHECK-NEXT: Determining loop execution counts for: @implied1 +; CHECK-NEXT: Loop %header: backedge-taken count is (-2 + %n) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 2147483645 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-2 + %n) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; +entry: + %cmp1 = icmp sgt i32 %n, 1 + %n.minus.1 = sub nsw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp sgt i32 %n.minus.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} + +define void @implied1_neg(i32 %n) { +; Prove that (n s> 0) =\=> (n - 1 s> 0). +; CHECK-LABEL: 'implied1_neg' +; CHECK-NEXT: Determining loop execution counts for: @implied1_neg +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax (-1 + %n))) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 2147483645 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax (-1 + %n))) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; +entry: + %cmp1 = icmp sgt i32 %n, 0 + %n.minus.1 = sub nsw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp sgt i32 %n.minus.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} + +define void @implied2(i32 %n) { +; Prove that (n u>= -1) ===> (n + 1 u>= 0). 
+; CHECK-LABEL: 'implied2' +; CHECK-NEXT: Determining loop execution counts for: @implied2 +; CHECK-NEXT: Loop %header: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %header: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %header: Unpredictable symbolic max backedge-taken count. +; CHECK-NEXT: Loop %header: Predicated backedge-taken count is (1 + (zext i32 %n to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: +; CHECK-NEXT: Loop %header: Predicated constant max backedge-taken count is i64 4294967296 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: +; CHECK-NEXT: Loop %header: Predicated symbolic max backedge-taken count is (1 + (zext i32 %n to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {1,+,1}<%header> Added Flags: +; +entry: + %cmp1 = icmp uge i32 %n, -1 + %n.1 = add nuw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp uge i32 %n.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} + +define void @implied2_neg(i32 %n) { +; Prove that (n u>= -1) =\=> (n - 1 s>= 0). +; CHECK-LABEL: 'implied2_neg' +; CHECK-NEXT: Determining loop execution counts for: @implied2_neg +; CHECK-NEXT: Loop %header: backedge-taken count is (-1 + (1 smax %n)) +; CHECK-NEXT: Loop %header: constant max backedge-taken count is i32 2147483646 +; CHECK-NEXT: Loop %header: symbolic max backedge-taken count is (-1 + (1 smax %n)) +; CHECK-NEXT: Loop %header: Trip multiple is 1 +; +entry: + %cmp1 = icmp uge i32 %n, -1 + %n.minus.1 = sub nuw nsw i32 %n, 1 + call void @llvm.assume(i1 %cmp1) + br label %header + +header: + %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ] + %indvar.next = add i32 %indvar, 1 + %exitcond = icmp sge i32 %n.minus.1, %indvar.next + br i1 %exitcond, label %header, label %exit + +exit: + ret void +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir index 253e6ebe793ce..76fdfd0c301f6 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir @@ -6,15 +6,15 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v16s8 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<16 x s8>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s8) = G_VECREDUCE_ADD [[LOAD]](<16 x s8>) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[VECREDUCE_ADD]](s8) - ; CHECK: $w0 = COPY [[ANYEXT]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[COPY]](p0) :: (load (<16 x s8>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s8) = G_VECREDUCE_ADD [[LOAD]](<16 x s8>) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[VECREDUCE_ADD]](s8) + ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<16 x s8>) = G_LOAD %0(p0) :: (load (<16 x s8>)) %2:_(s8) = G_VECREDUCE_ADD %1(<16 x s8>) @@ -29,15 +29,15 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v8s16 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<8 
x s16>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s16) = G_VECREDUCE_ADD [[LOAD]](<8 x s16>) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[VECREDUCE_ADD]](s16) - ; CHECK: $w0 = COPY [[ANYEXT]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[COPY]](p0) :: (load (<8 x s16>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s16) = G_VECREDUCE_ADD [[LOAD]](<8 x s16>) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[VECREDUCE_ADD]](s16) + ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<8 x s16>) = G_LOAD %0(p0) :: (load (<8 x s16>)) %2:_(s16) = G_VECREDUCE_ADD %1(<8 x s16>) @@ -52,14 +52,14 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v4s32 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<4 x s32>) - ; CHECK: $w0 = COPY [[VECREDUCE_ADD]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<4 x s32>) + ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_ADD]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<4 x s32>) = G_LOAD %0(p0) :: (load (<4 x s32>)) %2:_(s32) = G_VECREDUCE_ADD %1(<4 x s32>) @@ -73,14 +73,14 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v2s64 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[LOAD]](<2 x s64>) - ; CHECK: $x0 = COPY [[VECREDUCE_ADD]](s64) - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[LOAD]](<2 x s64>) + ; CHECK-NEXT: $x0 = COPY [[VECREDUCE_ADD]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:_(p0) = COPY $x0 %1:_(<2 x s64>) = G_LOAD %0(p0) :: (load (<2 x s64>)) %2:_(s64) = G_VECREDUCE_ADD %1(<2 x s64>) @@ -94,14 +94,14 @@ tracksRegLiveness: true body: | bb.1: liveins: $x0 - ; CHECK-LABEL: name: add_v2s32 ; CHECK: liveins: $x0 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<2 x s32>) - ; CHECK: $w0 = COPY [[VECREDUCE_ADD]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>)) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[LOAD]](<2 x s32>) + ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_ADD]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(p0) = COPY $x0 %1:_(<2 x s32>) = G_LOAD %0(p0) :: (load (<2 x s32>)) %2:_(s32) = G_VECREDUCE_ADD %1(<2 x s32>) @@ -111,24 +111,25 @@ body: | ... --- name: test_v8i64 +# This is a power-of-2 legalization, so use a tree reduction. 
alignment: 4 tracksRegLiveness: true body: | bb.1: liveins: $q0, $q1, $q2, $q3 - ; This is a power-of-2 legalization, so use a tree reduction. ; CHECK-LABEL: name: test_v8i64 ; CHECK: liveins: $q0, $q1, $q2, $q3 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 - ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 - ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 - ; CHECK: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3 - ; CHECK: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY]], [[COPY1]] - ; CHECK: [[ADD1:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY2]], [[COPY3]] - ; CHECK: [[ADD2:%[0-9]+]]:_(<2 x s64>) = G_ADD [[ADD]], [[ADD1]] - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[ADD2]](<2 x s64>) - ; CHECK: $x0 = COPY [[VECREDUCE_ADD]](s64) - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY2]], [[COPY3]] + ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(<2 x s64>) = G_ADD [[ADD]], [[ADD1]] + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[ADD2]](<2 x s64>) + ; CHECK-NEXT: $x0 = COPY [[VECREDUCE_ADD]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:_(<2 x s64>) = COPY $q0 %1:_(<2 x s64>) = COPY $q1 %2:_(<2 x s64>) = COPY $q2 @@ -143,25 +144,26 @@ body: | ... --- name: test_v6i64 +# This is a non-power-of-2 legalization, generate multiple vector reductions +# and combine them with scalar ops. alignment: 4 tracksRegLiveness: true body: | bb.1: liveins: $q0, $q1, $q2, $q3 - ; This is a non-power-of-2 legalization, generate multiple vector reductions - ; and combine them with scalar ops. 
; CHECK-LABEL: name: test_v6i64 ; CHECK: liveins: $q0, $q1, $q2, $q3 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 - ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 - ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY]](<2 x s64>) - ; CHECK: [[VECREDUCE_ADD1:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY1]](<2 x s64>) - ; CHECK: [[VECREDUCE_ADD2:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY2]](<2 x s64>) - ; CHECK: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[VECREDUCE_ADD]], [[VECREDUCE_ADD1]] - ; CHECK: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[ADD]], [[VECREDUCE_ADD2]] - ; CHECK: $x0 = COPY [[ADD1]](s64) - ; CHECK: RET_ReallyLR implicit $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_ADD1:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY1]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_ADD2:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY2]](<2 x s64>) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[VECREDUCE_ADD]], [[VECREDUCE_ADD1]] + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[ADD]], [[VECREDUCE_ADD2]] + ; CHECK-NEXT: $x0 = COPY [[ADD1]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 %0:_(<2 x s64>) = COPY $q0 %1:_(<2 x s64>) = COPY $q1 %2:_(<2 x s64>) = COPY $q2 diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll index def4192b0e005..aba284b4e0d29 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll @@ -1,8 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=generic | FileCheck %s -check-prefixes=CHECK,SDAG -; RUN: llc < %s -global-isel=1 -global-isel-abort=2 -mtriple=aarch64 -aarch64-neon-syntax=generic 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=generic | FileCheck %s -check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=aarch64 -global-isel=1 -global-isel-abort=2 -aarch64-neon-syntax=generic 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI -; Function Attrs: nounwind readnone declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>) declare i8 @llvm.vector.reduce.add.v3i8(<3 x i8>) declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>) @@ -23,14 +22,14 @@ declare i64 @llvm.vector.reduce.add.v3i64(<3 x i64>) declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) declare i128 @llvm.vector.reduce.add.v2i128(<2 x i128>) -; GISEL: warning: Instruction selection used fallback path for addv_v2i8 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i8 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v4i8 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v2i16 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i16 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i32 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v3i64 -; GISEL-NEXT: warning: Instruction selection used fallback path for addv_v2i128 +; CHECK-GI: warning: Instruction selection used fallback path for addv_v2i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback 
path for addv_v4i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v2i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i64 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v2i128 define i8 @add_B(ptr %arr) { @@ -83,34 +82,34 @@ define i64 @add_D(ptr %arr) { define i32 @oversized_ADDV_256(ptr noalias nocapture readonly %arg1, ptr noalias nocapture readonly %arg2) { -; SDAG-LABEL: oversized_ADDV_256: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: ldr d0, [x0] -; SDAG-NEXT: ldr d1, [x1] -; SDAG-NEXT: uabdl v0.8h, v0.8b, v1.8b -; SDAG-NEXT: uaddlv s0, v0.8h -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: oversized_ADDV_256: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ldr d0, [x0] +; CHECK-SD-NEXT: ldr d1, [x1] +; CHECK-SD-NEXT: uabdl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: uaddlv s0, v0.8h +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: oversized_ADDV_256: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: ldr d1, [x0] -; GISEL-NEXT: ldr d2, [x1] -; GISEL-NEXT: movi v0.2d, #0000000000000000 -; GISEL-NEXT: usubl v1.8h, v1.8b, v2.8b -; GISEL-NEXT: sshll v2.4s, v1.4h, #0 -; GISEL-NEXT: sshll2 v3.4s, v1.8h, #0 -; GISEL-NEXT: ssubw2 v0.4s, v0.4s, v1.8h -; GISEL-NEXT: cmlt v4.4s, v2.4s, #0 -; GISEL-NEXT: cmlt v5.4s, v3.4s, #0 -; GISEL-NEXT: neg v6.4s, v2.4s -; GISEL-NEXT: mov v1.16b, v4.16b -; GISEL-NEXT: bif v0.16b, v3.16b, v5.16b -; GISEL-NEXT: bsl v1.16b, v6.16b, v2.16b -; GISEL-NEXT: add v0.4s, v1.4s, v0.4s -; GISEL-NEXT: addv s0, v0.4s -; GISEL-NEXT: fmov w0, s0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: oversized_ADDV_256: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ldr d1, [x0] +; CHECK-GI-NEXT: ldr d2, [x1] +; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 +; CHECK-GI-NEXT: usubl v1.8h, v1.8b, v2.8b +; CHECK-GI-NEXT: sshll v2.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v1.8h, #0 +; CHECK-GI-NEXT: ssubw2 v0.4s, v0.4s, v1.8h +; CHECK-GI-NEXT: cmlt v4.4s, v2.4s, #0 +; CHECK-GI-NEXT: cmlt v5.4s, v3.4s, #0 +; CHECK-GI-NEXT: neg v6.4s, v2.4s +; CHECK-GI-NEXT: mov v1.16b, v4.16b +; CHECK-GI-NEXT: bif v0.16b, v3.16b, v5.16b +; CHECK-GI-NEXT: bsl v1.16b, v6.16b, v2.16b +; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret entry: %0 = load <8 x i8>, ptr %arg1, align 1 %1 = zext <8 x i8> %0 to <8 x i32> @@ -127,48 +126,48 @@ entry: declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) define i32 @oversized_ADDV_512(ptr %arr) { -; SDAG-LABEL: oversized_ADDV_512: -; SDAG: // %bb.0: -; SDAG-NEXT: ldp q0, q1, [x0, #32] -; SDAG-NEXT: ldp q2, q3, [x0] -; SDAG-NEXT: add v1.4s, v3.4s, v1.4s -; SDAG-NEXT: add v0.4s, v2.4s, v0.4s -; SDAG-NEXT: add v0.4s, v0.4s, v1.4s -; SDAG-NEXT: addv s0, v0.4s -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: oversized_ADDV_512: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldp q0, q1, [x0, #32] +; CHECK-SD-NEXT: ldp q2, q3, [x0] +; CHECK-SD-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-SD-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: oversized_ADDV_512: -; GISEL: // %bb.0: -; GISEL-NEXT: ldp q0, q1, [x0] -; GISEL-NEXT: ldp q2, q3, [x0, #32] -; GISEL-NEXT: add v0.4s, v0.4s, v1.4s -; 
GISEL-NEXT: add v1.4s, v2.4s, v3.4s -; GISEL-NEXT: add v0.4s, v0.4s, v1.4s -; GISEL-NEXT: addv s0, v0.4s -; GISEL-NEXT: fmov w0, s0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: oversized_ADDV_512: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldp q0, q1, [x0] +; CHECK-GI-NEXT: ldp q2, q3, [x0, #32] +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %bin.rdx = load <16 x i32>, ptr %arr %r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %bin.rdx) ret i32 %r } define i8 @addv_combine_i8(<8 x i8> %a1, <8 x i8> %a2) { -; SDAG-LABEL: addv_combine_i8: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.8b, v0.8b, v1.8b -; SDAG-NEXT: addv b0, v0.8b -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.8b, v0.8b, v1.8b +; CHECK-SD-NEXT: addv b0, v0.8b +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i8: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addv b0, v0.8b -; GISEL-NEXT: addv b1, v1.8b -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: add w0, w9, w8, uxtb -; GISEL-NEXT: ret +; CHECK-GI-LABEL: addv_combine_i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addv b0, v0.8b +; CHECK-GI-NEXT: addv b1, v1.8b +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w9, w8, uxtb +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a1) %rdx.2 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a2) @@ -177,21 +176,21 @@ entry: } define i16 @addv_combine_i16(<4 x i16> %a1, <4 x i16> %a2) { -; SDAG-LABEL: addv_combine_i16: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.4h, v0.4h, v1.4h -; SDAG-NEXT: addv h0, v0.4h -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: addv h0, v0.4h +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i16: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addv h0, v0.4h -; GISEL-NEXT: addv h1, v1.4h -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: add w0, w9, w8, uxth -; GISEL-NEXT: ret +; CHECK-GI-LABEL: addv_combine_i16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addv h0, v0.4h +; CHECK-GI-NEXT: addv h1, v1.4h +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w9, w8, uxth +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a1) %rdx.2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2) @@ -200,21 +199,21 @@ entry: } define i32 @addv_combine_i32(<4 x i32> %a1, <4 x i32> %a2) { -; SDAG-LABEL: addv_combine_i32: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.4s, v0.4s, v1.4s -; SDAG-NEXT: addv s0, v0.4s -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i32: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addv s0, v0.4s -; GISEL-NEXT: addv s1, v1.4s -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: add w0, w8, w9 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: addv_combine_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addv s0, v0.4s +; 
CHECK-GI-NEXT: addv s1, v1.4s +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: add w0, w8, w9 +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a1) %rdx.2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2) @@ -223,21 +222,21 @@ entry: } define i64 @addv_combine_i64(<2 x i64> %a1, <2 x i64> %a2) { -; SDAG-LABEL: addv_combine_i64: -; SDAG: // %bb.0: // %entry -; SDAG-NEXT: add v0.2d, v0.2d, v1.2d -; SDAG-NEXT: addp d0, v0.2d -; SDAG-NEXT: fmov x0, d0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: addv_combine_i64: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: addp d0, v0.2d +; CHECK-SD-NEXT: fmov x0, d0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: addv_combine_i64: -; GISEL: // %bb.0: // %entry -; GISEL-NEXT: addp d0, v0.2d -; GISEL-NEXT: addp d1, v1.2d -; GISEL-NEXT: fmov x8, d0 -; GISEL-NEXT: fmov x9, d1 -; GISEL-NEXT: add x0, x8, x9 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: addv_combine_i64: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addp d0, v0.2d +; CHECK-GI-NEXT: addp d1, v1.2d +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: add x0, x8, x9 +; CHECK-GI-NEXT: ret entry: %rdx.1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1) %rdx.2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a2) @@ -471,3 +470,6 @@ entry: ret i128 %arg1 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GISEL: {{.*}} +; SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll b/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll index b498611242d46..d69d1b6eb4a2a 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SDAG -; RUN: llc < %s -global-isel -global-isel-abort=1 -pass-remarks-missed=gisel* -mtriple=arm64-linux-gnu 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL,FALLBACK +; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=arm64-linux-gnu -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI %0 = type { i64, i64 } @@ -39,22 +39,21 @@ declare i32 @llvm.aarch64.stxp(i64, i64, ptr) nounwind @var = dso_local global i64 0, align 8 -; FALLBACK-NOT: remark:{{.*}}test_load_i8 define dso_local void @test_load_i8(ptr %addr) { -; SDAG-LABEL: test_load_i8: -; SDAG: // %bb.0: -; SDAG-NEXT: ldxrb w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldxrb w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_i8: -; GISEL: // %bb.0: -; GISEL-NEXT: ldxrb w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldxrb w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldxr.p0(ptr elementtype(i8) %addr) %shortval = trunc i64 %val to i8 @@ -63,22 +62,21 @@ define dso_local void @test_load_i8(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_i16 
define dso_local void @test_load_i16(ptr %addr) { -; SDAG-LABEL: test_load_i16: -; SDAG: // %bb.0: -; SDAG-NEXT: ldxrh w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldxrh w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_i16: -; GISEL: // %bb.0: -; GISEL-NEXT: ldxrh w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xffff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldxrh w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xffff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldxr.p0(ptr elementtype(i16) %addr) %shortval = trunc i64 %val to i16 @@ -87,22 +85,21 @@ define dso_local void @test_load_i16(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_i32 define dso_local void @test_load_i32(ptr %addr) { -; SDAG-LABEL: test_load_i32: -; SDAG: // %bb.0: -; SDAG-NEXT: ldxr w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldxr w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_i32: -; GISEL: // %bb.0: -; GISEL-NEXT: ldxr w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: mov w9, w9 -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldxr w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: mov w9, w9 +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldxr.p0(ptr elementtype(i32) %addr) %shortval = trunc i64 %val to i32 @@ -111,7 +108,6 @@ define dso_local void @test_load_i32(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_i64 define dso_local void @test_load_i64(ptr %addr) { ; CHECK-LABEL: test_load_i64: ; CHECK: // %bb.0: @@ -128,7 +124,6 @@ define dso_local void @test_load_i64(ptr %addr) { declare i64 @llvm.aarch64.ldxr.p0(ptr) nounwind -; FALLBACK-NOT: remark:{{.*}}test_store_i8 define dso_local i32 @test_store_i8(i32, i8 %val, ptr %addr) { ; CHECK-LABEL: test_store_i8: ; CHECK: // %bb.0: @@ -140,7 +135,6 @@ define dso_local i32 @test_store_i8(i32, i8 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_i16 define dso_local i32 @test_store_i16(i32, i16 %val, ptr %addr) { ; CHECK-LABEL: test_store_i16: ; CHECK: // %bb.0: @@ -152,7 +146,6 @@ define dso_local i32 @test_store_i16(i32, i16 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_i32 define dso_local i32 @test_store_i32(i32, i32 %val, ptr %addr) { ; CHECK-LABEL: test_store_i32: ; CHECK: // %bb.0: @@ -163,7 +156,6 @@ define dso_local i32 @test_store_i32(i32, i32 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_i64 define dso_local i32 @test_store_i64(i32, i64 %val, ptr %addr) { ; CHECK-LABEL: test_store_i64: ; CHECK: // %bb.0: @@ -219,22 +211,21 @@ entry: declare %0 @llvm.aarch64.ldaxp(ptr) nounwind declare i32 @llvm.aarch64.stlxp(i64, i64, ptr) nounwind -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i8 define dso_local void @test_load_acquire_i8(ptr %addr) { -; SDAG-LABEL: test_load_acquire_i8: -; SDAG: // %bb.0: -; SDAG-NEXT: ldaxrb w8, [x0] -; 
SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_acquire_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldaxrb w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_acquire_i8: -; GISEL: // %bb.0: -; GISEL-NEXT: ldaxrb w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_acquire_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldaxrb w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i8) %addr) %shortval = trunc i64 %val to i8 @@ -243,22 +234,21 @@ define dso_local void @test_load_acquire_i8(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i16 define dso_local void @test_load_acquire_i16(ptr %addr) { -; SDAG-LABEL: test_load_acquire_i16: -; SDAG: // %bb.0: -; SDAG-NEXT: ldaxrh w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_acquire_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldaxrh w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_acquire_i16: -; GISEL: // %bb.0: -; GISEL-NEXT: ldaxrh w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: and x9, x9, #0xffff -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_acquire_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldaxrh w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: and x9, x9, #0xffff +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i16) %addr) %shortval = trunc i64 %val to i16 @@ -267,22 +257,21 @@ define dso_local void @test_load_acquire_i16(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i32 define dso_local void @test_load_acquire_i32(ptr %addr) { -; SDAG-LABEL: test_load_acquire_i32: -; SDAG: // %bb.0: -; SDAG-NEXT: ldaxr w8, [x0] -; SDAG-NEXT: adrp x9, var -; SDAG-NEXT: str x8, [x9, :lo12:var] -; SDAG-NEXT: ret +; CHECK-SD-LABEL: test_load_acquire_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldaxr w8, [x0] +; CHECK-SD-NEXT: adrp x9, var +; CHECK-SD-NEXT: str x8, [x9, :lo12:var] +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: test_load_acquire_i32: -; GISEL: // %bb.0: -; GISEL-NEXT: ldaxr w9, [x0] -; GISEL-NEXT: adrp x8, var -; GISEL-NEXT: mov w9, w9 -; GISEL-NEXT: str x9, [x8, :lo12:var] -; GISEL-NEXT: ret +; CHECK-GI-LABEL: test_load_acquire_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldaxr w9, [x0] +; CHECK-GI-NEXT: adrp x8, var +; CHECK-GI-NEXT: mov w9, w9 +; CHECK-GI-NEXT: str x9, [x8, :lo12:var] +; CHECK-GI-NEXT: ret %val = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i32) %addr) %shortval = trunc i64 %val to i32 @@ -291,7 +280,6 @@ define dso_local void @test_load_acquire_i32(ptr %addr) { ret void } -; FALLBACK-NOT: remark:{{.*}}test_load_acquire_i64 define dso_local void @test_load_acquire_i64(ptr %addr) { ; CHECK-LABEL: test_load_acquire_i64: ; CHECK: // %bb.0: @@ -308,7 +296,6 @@ define dso_local void @test_load_acquire_i64(ptr %addr) { declare i64 @llvm.aarch64.ldaxr.p0(ptr) nounwind -; FALLBACK-NOT: remark:{{.*}}test_store_release_i8 define dso_local i32 @test_store_release_i8(i32, i8 %val, ptr %addr) { ; CHECK-LABEL: test_store_release_i8: 
; CHECK: // %bb.0: @@ -320,7 +307,6 @@ define dso_local i32 @test_store_release_i8(i32, i8 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_release_i16 define dso_local i32 @test_store_release_i16(i32, i16 %val, ptr %addr) { ; CHECK-LABEL: test_store_release_i16: ; CHECK: // %bb.0: @@ -332,7 +318,6 @@ define dso_local i32 @test_store_release_i16(i32, i16 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_release_i32 define dso_local i32 @test_store_release_i32(i32, i32 %val, ptr %addr) { ; CHECK-LABEL: test_store_release_i32: ; CHECK: // %bb.0: @@ -343,7 +328,6 @@ define dso_local i32 @test_store_release_i32(i32, i32 %val, ptr %addr) { ret i32 %res } -; FALLBACK-NOT: remark:{{.*}}test_store_release_i64 define dso_local i32 @test_store_release_i64(i32, i64 %val, ptr %addr) { ; CHECK-LABEL: test_store_release_i64: ; CHECK: // %bb.0: @@ -378,5 +362,3 @@ define dso_local i32 @test_stxp_undef_inline_asm(ptr %p, i64 %x) nounwind { } declare i32 @llvm.aarch64.stlxr.p0(i64, ptr) nounwind -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; FALLBACK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/half-precision-signof-no-assert.ll b/llvm/test/CodeGen/AArch64/half-precision-signof-no-assert.ll new file mode 100644 index 0000000000000..92e15e78d8c41 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/half-precision-signof-no-assert.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +; Check that the following does not crash +; See https://github.com/llvm/llvm-project/issues/123029 for details + +define ptr @fn(ptr %in, ptr %out) { +; CHECK-LABEL: fn: +; CHECK: // %bb.0: // %fn +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: movi v0.4h, #60, lsl #8 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: fcvtl v1.4s, v1.4h +; CHECK-NEXT: fcmgt v2.4s, v1.4s, #0.0 +; CHECK-NEXT: fcmlt v1.4s, v1.4s, #0.0 +; CHECK-NEXT: orr v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ldr h2, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: xtn v1.4h, v1.4s +; CHECK-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: str d0, [x1] +; CHECK-NEXT: ldr h0, [x0, #8] +; CHECK-NEXT: mov x0, xzr +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcmp s0, #0.0 +; CHECK-NEXT: fcsel s1, s2, s1, mi +; CHECK-NEXT: fcsel s1, s2, s1, gt +; CHECK-NEXT: mvni v2.4s, #128, lsl #24 +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: str h0, [x1, #8] +; CHECK-NEXT: ret +fn: + %1 = load <4 x half>, ptr %in + %2 = fcmp one <4 x half> %1, zeroinitializer + %3 = uitofp <4 x i1> %2 to <4 x half> + store <4 x half> %3, ptr %out + + %4 = getelementptr inbounds nuw i8, ptr %in, i64 8 + %5 = load half, ptr %4 + %6 = fcmp one half %5, 0xH0000 + %7 = uitofp i1 %6 to half + %8 = call half @llvm.copysign.f16(half %7, half %5) + %9 = getelementptr inbounds nuw i8, ptr %out, i64 8 + store half %8, ptr %9 + ret ptr null +} diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll index 7f3c1fdc93380..c9fe258f11556 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll @@ -1,86 +1,87 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc 
-mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,SDAG -; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -global-isel -global-isel-abort=2 -verify-machineinstrs < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,GISEL +; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -aarch64-enable-collect-loh=false -global-isel -global-isel-abort=2 -verify-machineinstrs < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; Basic tests from input vector to bitmask ; IR generated from clang for: ; __builtin_convertvector + reinterpret_cast -; GISEL: warning: Instruction selection used fallback path for clang_builtins_undef_concat_convert_to_bitmask4 -; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_2xi32 -; GISEL-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_8xi2 -; GISEL-NEXT: warning: Instruction selection used fallback path for no_direct_convert_for_bad_concat +; CHECK-GI: warning: Instruction selection used fallback path for convert_to_bitmask2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for clang_builtins_undef_concat_convert_to_bitmask4 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_2xi32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for convert_to_bitmask_8xi2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for no_direct_convert_for_bad_concat define i16 @convert_to_bitmask16(<16 x i8> %vec) { ; Bits used in mask -; SDAG-LABEL: convert_to_bitmask16: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, lCPI0_0@PAGE -; SDAG-NEXT: cmeq.16b v0, v0, #0 -; SDAG-NEXT: ldr q1, [x8, lCPI0_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: ext.16b v1, v0, v0, #8 -; SDAG-NEXT: zip1.16b v0, v0, v1 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask16: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: adrp x8, lCPI0_0@PAGE +; CHECK-SD-NEXT: cmeq.16b v0, v0, #0 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI0_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8 +; CHECK-SD-NEXT: zip1.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask16: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.16b v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[8] -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[9] -; GISEL-NEXT: orr w8, w8, w9, lsl 
#7 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[10] -; GISEL-NEXT: orr w8, w8, w9, lsl #8 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[11] -; GISEL-NEXT: orr w8, w8, w9, lsl #9 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[12] -; GISEL-NEXT: orr w8, w8, w9, lsl #10 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[13] -; GISEL-NEXT: orr w8, w8, w9, lsl #11 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[14] -; GISEL-NEXT: orr w8, w8, w9, lsl #12 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[15] -; GISEL-NEXT: orr w8, w8, w9, lsl #13 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #14 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #15 -; GISEL-NEXT: strh w8, [sp, #14] -; GISEL-NEXT: and w0, w8, #0xffff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask16: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.16b v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[8] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[9] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[10] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #8 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[11] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #9 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[12] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[13] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #11 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[14] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[15] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #13 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #15 +; CHECK-GI-NEXT: strh w8, [sp, #14] +; CHECK-GI-NEXT: and w0, w8, #0xffff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret ; Actual conversion @@ -90,50 +91,50 @@ define i16 @convert_to_bitmask16(<16 x i8> %vec) { } define i16 @convert_to_bitmask8(<8 x i16> %vec) { -; SDAG-LABEL: convert_to_bitmask8: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, lCPI1_0@PAGE -; SDAG-NEXT: cmeq.8h v0, v0, #0 -; SDAG-NEXT: ldr q1, [x8, lCPI1_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w8, s0 -; SDAG-NEXT: and w0, w8, #0xff -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask8: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: adrp x8, lCPI1_0@PAGE +; CHECK-SD-NEXT: cmeq.8h 
v0, v0, #0 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI1_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w8, s0 +; CHECK-SD-NEXT: and w0, w8, #0xff +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask8: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.8h v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: xtn.8b v0, v0 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #7 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask8: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.8h v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: xtn.8b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer @@ -143,36 +144,36 @@ define i16 @convert_to_bitmask8(<8 x i16> %vec) { } define i4 @convert_to_bitmask4(<4 x i32> %vec) { -; SDAG-LABEL: convert_to_bitmask4: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, lCPI2_0@PAGE -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: ldr q1, [x8, lCPI2_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask4: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: adrp x8, lCPI2_0@PAGE +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI2_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask4: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: 
mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask4: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer @@ -220,37 +221,37 @@ define i8 @clang_builtins_undef_concat_convert_to_bitmask4(<4 x i32> %vec) { define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_no_compare: -; SDAG: ; %bb.0: -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: adrp x8, lCPI5_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI5_0@PAGEOFF] -; SDAG-NEXT: shl.4s v0, v0, #31 -; SDAG-NEXT: cmlt.4s v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_no_compare: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: adrp x8, lCPI5_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI5_0@PAGEOFF] +; CHECK-SD-NEXT: shl.4s v0, v0, #31 +; CHECK-SD-NEXT: cmlt.4s v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_no_compare: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: and.16b v0, v0, v1 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_no_compare: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: and.16b v0, v0, v1 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp = and <4 x i32> %vec1, %vec2 @@ -260,39 +261,39 @@ define i4 @convert_to_bitmask_no_compare(<4 x i32> %vec1, <4 x i32> %vec2) { } define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: 
convert_to_bitmask_with_compare_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v2, v0, #0 -; SDAG-NEXT: cmeq.4s v0, v0, v1 -; SDAG-NEXT: adrp x8, lCPI6_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI6_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v0, v2 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_compare_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v2, v0, #0 +; CHECK-SD-NEXT: cmeq.4s v0, v0, v1 +; CHECK-SD-NEXT: adrp x8, lCPI6_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI6_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v0, v2 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_compare_chain: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v2, v0, #0 -; GISEL-NEXT: cmeq.4s v0, v0, v1 -; GISEL-NEXT: bic.16b v0, v0, v2 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_compare_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v2, v0, #0 +; CHECK-GI-NEXT: cmeq.4s v0, v0, v1 +; CHECK-GI-NEXT: bic.16b v0, v0, v2 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer @@ -303,39 +304,39 @@ define i4 @convert_to_bitmask_with_compare_chain(<4 x i32> %vec1, <4 x i32> %vec } define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_with_trunc_in_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: adrp x8, lCPI7_0@PAGE -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: ldr q1, [x8, lCPI7_0@PAGEOFF] -; SDAG-NEXT: shl.4s v0, v0, #31 -; SDAG-NEXT: cmlt.4s v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_trunc_in_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: adrp x8, lCPI7_0@PAGE +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI7_0@PAGEOFF] +; CHECK-SD-NEXT: shl.4s v0, v0, #31 +; CHECK-SD-NEXT: cmlt.4s v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_trunc_in_chain: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: bic.16b v0, v1, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; 
GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_trunc_in_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: bic.16b v0, v1, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer @@ -346,82 +347,82 @@ define i4 @convert_to_bitmask_with_trunc_in_chain(<4 x i32> %vec1, <4 x i32> %ve } define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: adrp x8, lCPI8_0@PAGE -; SDAG-NEXT: movi d2, #0x000000ffffffff -; SDAG-NEXT: movi d3, #0x00ffffffffffff -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: movi d1, #0xffff0000ffff0000 -; SDAG-NEXT: xtn.4h v0, v0 -; SDAG-NEXT: orr.8b v0, v0, v2 -; SDAG-NEXT: movi d2, #0x00ffffffff0000 -; SDAG-NEXT: eor.8b v1, v0, v1 -; SDAG-NEXT: eor.8b v0, v0, v2 -; SDAG-NEXT: mov.h v1[2], wzr -; SDAG-NEXT: orr.8b v0, v0, v3 -; SDAG-NEXT: orr.8b v0, v1, v0 -; SDAG-NEXT: ldr d1, [x8, lCPI8_0@PAGEOFF] -; SDAG-NEXT: shl.4h v0, v0, #15 -; SDAG-NEXT: cmlt.4h v0, v0, #0 -; SDAG-NEXT: and.8b v0, v0, v1 -; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: adrp x8, lCPI8_0@PAGE +; CHECK-SD-NEXT: movi d2, #0x000000ffffffff +; CHECK-SD-NEXT: movi d3, #0x00ffffffffffff +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: movi d1, #0xffff0000ffff0000 +; CHECK-SD-NEXT: xtn.4h v0, v0 +; CHECK-SD-NEXT: orr.8b v0, v0, v2 +; CHECK-SD-NEXT: movi d2, #0x00ffffffff0000 +; CHECK-SD-NEXT: eor.8b v1, v0, v1 +; CHECK-SD-NEXT: eor.8b v0, v0, v2 +; CHECK-SD-NEXT: mov.h v1[2], wzr +; CHECK-SD-NEXT: orr.8b v0, v0, v3 +; CHECK-SD-NEXT: orr.8b v0, v1, v0 +; CHECK-SD-NEXT: ldr d1, [x8, lCPI8_0@PAGEOFF] +; CHECK-SD-NEXT: shl.4h v0, v0, #15 +; CHECK-SD-NEXT: cmlt.4h v0, v0, #0 +; CHECK-SD-NEXT: and.8b v0, v0, v1 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: mov w8, #1 ; =0x1 -; GISEL-NEXT: mov w9, #0 ; =0x0 -; GISEL-NEXT: cmeq.4s v5, v0, #0 -; GISEL-NEXT: fmov s2, w8 -; GISEL-NEXT: fmov s4, w9 -; GISEL-NEXT: cmeq.4s v1, v1, #0 -; GISEL-NEXT: mov.16b v3, v2 -; GISEL-NEXT: mov.16b v0, v4 -; GISEL-NEXT: mov.h v4[1], w8 -; GISEL-NEXT: bic.16b 
v1, v1, v5 -; GISEL-NEXT: mov.16b v5, v2 -; GISEL-NEXT: mov.h v2[1], w8 -; GISEL-NEXT: mov.h v3[1], w8 -; GISEL-NEXT: mov.h v0[1], w8 -; GISEL-NEXT: mov.h v5[1], w8 -; GISEL-NEXT: mov.h v4[2], w8 -; GISEL-NEXT: xtn.4h v1, v1 -; GISEL-NEXT: mov.h v2[2], w8 -; GISEL-NEXT: mov.h v3[2], w9 -; GISEL-NEXT: mov.h v0[2], w9 -; GISEL-NEXT: mov.h v5[2], w9 -; GISEL-NEXT: mov.h v4[3], w9 -; GISEL-NEXT: mov.h v2[3], w9 -; GISEL-NEXT: mov.h v3[3], w9 -; GISEL-NEXT: mov.h v0[3], w8 -; GISEL-NEXT: mov.h v5[3], w8 -; GISEL-NEXT: orr.8b v1, v1, v3 -; GISEL-NEXT: eor.8b v0, v1, v0 -; GISEL-NEXT: eor.8b v1, v4, v1 -; GISEL-NEXT: and.8b v0, v0, v5 -; GISEL-NEXT: orr.8b v1, v2, v1 -; GISEL-NEXT: orr.8b v0, v0, v1 -; GISEL-NEXT: ushll.4s v0, v0, #0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_unknown_type_in_long_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w8, #1 ; =0x1 +; CHECK-GI-NEXT: mov w9, #0 ; =0x0 +; CHECK-GI-NEXT: cmeq.4s v5, v0, #0 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: cmeq.4s v1, v1, #0 +; CHECK-GI-NEXT: mov.16b v3, v2 +; CHECK-GI-NEXT: mov.16b v0, v4 +; CHECK-GI-NEXT: mov.h v4[1], w8 +; CHECK-GI-NEXT: bic.16b v1, v1, v5 +; CHECK-GI-NEXT: mov.16b v5, v2 +; CHECK-GI-NEXT: mov.h v2[1], w8 +; CHECK-GI-NEXT: mov.h v3[1], w8 +; CHECK-GI-NEXT: mov.h v0[1], w8 +; CHECK-GI-NEXT: mov.h v5[1], w8 +; CHECK-GI-NEXT: mov.h v4[2], w8 +; CHECK-GI-NEXT: xtn.4h v1, v1 +; CHECK-GI-NEXT: mov.h v2[2], w8 +; CHECK-GI-NEXT: mov.h v3[2], w9 +; CHECK-GI-NEXT: mov.h v0[2], w9 +; CHECK-GI-NEXT: mov.h v5[2], w9 +; CHECK-GI-NEXT: mov.h v4[3], w9 +; CHECK-GI-NEXT: mov.h v2[3], w9 +; CHECK-GI-NEXT: mov.h v3[3], w9 +; CHECK-GI-NEXT: mov.h v0[3], w8 +; CHECK-GI-NEXT: mov.h v5[3], w8 +; CHECK-GI-NEXT: orr.8b v1, v1, v3 +; CHECK-GI-NEXT: eor.8b v0, v1, v0 +; CHECK-GI-NEXT: eor.8b v1, v4, v1 +; CHECK-GI-NEXT: and.8b v0, v0, v5 +; CHECK-GI-NEXT: orr.8b v1, v2, v1 +; CHECK-GI-NEXT: orr.8b v0, v0, v1 +; CHECK-GI-NEXT: ushll.4s v0, v0, #0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i32> %vec1, zeroinitializer @@ -440,42 +441,42 @@ define i4 @convert_to_bitmask_with_unknown_type_in_long_chain(<4 x i32> %vec1, < } define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4 x i32> %vec2) { -; SDAG-LABEL: convert_to_bitmask_with_different_types_in_chain: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: cmeq.4h v0, v0, #0 -; SDAG-NEXT: adrp x8, lCPI9_0@PAGE -; SDAG-NEXT: xtn.4h v1, v1 -; SDAG-NEXT: orn.8b v0, v1, v0 -; SDAG-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF] -; SDAG-NEXT: and.8b v0, v0, v1 
-; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_with_different_types_in_chain: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: cmeq.4h v0, v0, #0 +; CHECK-SD-NEXT: adrp x8, lCPI9_0@PAGE +; CHECK-SD-NEXT: xtn.4h v1, v1 +; CHECK-SD-NEXT: orn.8b v0, v1, v0 +; CHECK-SD-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF] +; CHECK-SD-NEXT: and.8b v0, v0, v1 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_with_different_types_in_chain: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v1, v1, #0 -; GISEL-NEXT: cmeq.4h v0, v0, #0 -; GISEL-NEXT: xtn.4h v1, v1 -; GISEL-NEXT: orn.8b v0, v1, v0 -; GISEL-NEXT: ushll.4s v0, v0, #0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_with_different_types_in_chain: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v1, v1, #0 +; CHECK-GI-NEXT: cmeq.4h v0, v0, #0 +; CHECK-GI-NEXT: xtn.4h v1, v1 +; CHECK-GI-NEXT: orn.8b v0, v1, v0 +; CHECK-GI-NEXT: ushll.4s v0, v0, #0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp1 = icmp ne <4 x i16> %vec1, zeroinitializer @@ -486,73 +487,73 @@ define i4 @convert_to_bitmask_with_different_types_in_chain(<4 x i16> %vec1, <4 } define i16 @convert_to_bitmask_without_knowing_type(<16 x i1> %vec) { -; SDAG-LABEL: convert_to_bitmask_without_knowing_type: -; SDAG: ; %bb.0: -; SDAG-NEXT: shl.16b v0, v0, #7 -; SDAG-NEXT: adrp x8, lCPI10_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI10_0@PAGEOFF] -; SDAG-NEXT: cmlt.16b v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: ext.16b v1, v0, v0, #8 -; SDAG-NEXT: zip1.16b v0, v0, v1 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_without_knowing_type: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: shl.16b v0, v0, #7 +; CHECK-SD-NEXT: adrp x8, lCPI10_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI10_0@PAGEOFF] +; CHECK-SD-NEXT: cmlt.16b v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8 +; CHECK-SD-NEXT: zip1.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_without_knowing_type: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: 
and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[8] -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[9] -; GISEL-NEXT: orr w8, w8, w9, lsl #7 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[10] -; GISEL-NEXT: orr w8, w8, w9, lsl #8 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[11] -; GISEL-NEXT: orr w8, w8, w9, lsl #9 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[12] -; GISEL-NEXT: orr w8, w8, w9, lsl #10 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[13] -; GISEL-NEXT: orr w8, w8, w9, lsl #11 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[14] -; GISEL-NEXT: orr w8, w8, w9, lsl #12 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[15] -; GISEL-NEXT: orr w8, w8, w9, lsl #13 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #14 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #15 -; GISEL-NEXT: strh w8, [sp, #14] -; GISEL-NEXT: and w0, w8, #0xffff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_without_knowing_type: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[8] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[9] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[10] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #8 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[11] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #9 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[12] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #10 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[13] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #11 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[14] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #12 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[15] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #13 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #14 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #15 +; CHECK-GI-NEXT: strh w8, [sp, #14] +; CHECK-GI-NEXT: and w0, w8, #0xffff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %bitmask = bitcast <16 
x i1> %vec to i16 ret i16 %bitmask @@ -575,51 +576,51 @@ define i2 @convert_to_bitmask_2xi32(<2 x i32> %vec) { } define i4 @convert_to_bitmask_4xi8(<4 x i8> %vec) { -; SDAG-LABEL: convert_to_bitmask_4xi8: -; SDAG: ; %bb.0: -; SDAG-NEXT: bic.4h v0, #255, lsl #8 -; SDAG-NEXT: adrp x8, lCPI12_0@PAGE -; SDAG-NEXT: ldr d1, [x8, lCPI12_0@PAGEOFF] -; SDAG-NEXT: cmeq.4h v0, v0, #0 -; SDAG-NEXT: bic.8b v0, v1, v0 -; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_4xi8: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: bic.4h v0, #255, lsl #8 +; CHECK-SD-NEXT: adrp x8, lCPI12_0@PAGE +; CHECK-SD-NEXT: ldr d1, [x8, lCPI12_0@PAGEOFF] +; CHECK-SD-NEXT: cmeq.4h v0, v0, #0 +; CHECK-SD-NEXT: bic.8b v0, v1, v0 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_4xi8: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: mov w8, #0 ; =0x0 -; GISEL-NEXT: uzp1.8b v0, v0, v0 -; GISEL-NEXT: fmov s1, w8 -; GISEL-NEXT: mov.b v1[1], w8 -; GISEL-NEXT: mov.b v1[2], w8 -; GISEL-NEXT: mov.b v1[3], w8 -; GISEL-NEXT: cmeq.8b v0, v0, v1 -; GISEL-NEXT: mvn.8b v0, v0 -; GISEL-NEXT: umov.b w8, v0[0] -; GISEL-NEXT: umov.b w9, v0[1] -; GISEL-NEXT: mov.s v1[0], w8 -; GISEL-NEXT: umov.b w8, v0[2] -; GISEL-NEXT: mov.s v1[1], w9 -; GISEL-NEXT: umov.b w9, v0[3] -; GISEL-NEXT: mov.s v1[2], w8 -; GISEL-NEXT: mov.s v1[3], w9 -; GISEL-NEXT: mov.s w8, v1[1] -; GISEL-NEXT: mov.s w9, v1[2] -; GISEL-NEXT: fmov w11, s1 -; GISEL-NEXT: mov.s w10, v1[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_4xi8: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov w8, #0 ; =0x0 +; CHECK-GI-NEXT: uzp1.8b v0, v0, v0 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov.b v1[1], w8 +; CHECK-GI-NEXT: mov.b v1[2], w8 +; CHECK-GI-NEXT: mov.b v1[3], w8 +; CHECK-GI-NEXT: cmeq.8b v0, v0, v1 +; CHECK-GI-NEXT: mvn.8b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[0] +; CHECK-GI-NEXT: umov.b w9, v0[1] +; CHECK-GI-NEXT: mov.s v1[0], w8 +; CHECK-GI-NEXT: umov.b w8, v0[2] +; CHECK-GI-NEXT: mov.s v1[1], w9 +; CHECK-GI-NEXT: umov.b w9, v0[3] +; CHECK-GI-NEXT: mov.s v1[2], w8 +; CHECK-GI-NEXT: mov.s v1[3], w9 +; CHECK-GI-NEXT: mov.s w8, v1[1] +; CHECK-GI-NEXT: mov.s w9, v1[2] +; CHECK-GI-NEXT: fmov w11, s1 +; CHECK-GI-NEXT: mov.s w10, v1[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <4 x i8> %vec, zeroinitializer %bitmask = bitcast <4 x i1> %cmp_result to i4 @@ -645,39 +646,39 @@ define i8 @convert_to_bitmask_8xi2(<8 x i2> %vec) { } define i4 @convert_to_bitmask_float(<4 x float> %vec) { -; SDAG-LABEL: convert_to_bitmask_float: -; SDAG: ; %bb.0: -; SDAG-NEXT: fcmgt.4s v1, v0, #0.0 -; SDAG-NEXT: fcmlt.4s v0, v0, #0.0 -; SDAG-NEXT: adrp x8, lCPI14_0@PAGE -; SDAG-NEXT: orr.16b v0, v0, v1 -; SDAG-NEXT: ldr q1, 
[x8, lCPI14_0@PAGEOFF] -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: addv.4s s0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_to_bitmask_float: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: fcmgt.4s v1, v0, #0.0 +; CHECK-SD-NEXT: fcmlt.4s v0, v0, #0.0 +; CHECK-SD-NEXT: adrp x8, lCPI14_0@PAGE +; CHECK-SD-NEXT: orr.16b v0, v0, v1 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI14_0@PAGEOFF] +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.4s s0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_to_bitmask_float: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: fcmgt.4s v1, v0, #0.0 -; GISEL-NEXT: fcmlt.4s v0, v0, #0.0 -; GISEL-NEXT: orr.16b v0, v0, v1 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_to_bitmask_float: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: fcmgt.4s v1, v0, #0.0 +; CHECK-GI-NEXT: fcmlt.4s v0, v0, #0.0 +; CHECK-GI-NEXT: orr.16b v0, v0, v1 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = fcmp one <4 x float> %vec, zeroinitializer @@ -688,58 +689,58 @@ define i4 @convert_to_bitmask_float(<4 x float> %vec) { ; Larger vector types don't map directly, but they can be split/truncated and then converted. ; After the comparison against 0, this is truncated to <8 x i16>, which is valid again.
define i8 @convert_large_vector(<8 x i32> %vec) { -; SDAG-LABEL: convert_large_vector: -; SDAG: ; %bb.0: -; SDAG-NEXT: sub sp, sp, #16 -; SDAG-NEXT: .cfi_def_cfa_offset 16 -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: adrp x8, lCPI15_0@PAGE -; SDAG-NEXT: uzp1.8h v0, v0, v1 -; SDAG-NEXT: ldr q1, [x8, lCPI15_0@PAGEOFF] -; SDAG-NEXT: bic.16b v0, v1, v0 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: fmov w8, s0 -; SDAG-NEXT: and w0, w8, #0xff -; SDAG-NEXT: add sp, sp, #16 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_large_vector: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: adrp x8, lCPI15_0@PAGE +; CHECK-SD-NEXT: uzp1.8h v0, v0, v1 +; CHECK-SD-NEXT: ldr q1, [x8, lCPI15_0@PAGEOFF] +; CHECK-SD-NEXT: bic.16b v0, v1, v0 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: fmov w8, s0 +; CHECK-SD-NEXT: and w0, w8, #0xff +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_large_vector: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: cmeq.4s v1, v1, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mvn.16b v1, v1 -; GISEL-NEXT: uzp1.8h v0, v0, v1 -; GISEL-NEXT: xtn.8b v0, v0 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w10, v0[2] -; GISEL-NEXT: umov.b w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: umov.b w10, v0[6] -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #6 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #7 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_large_vector: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: cmeq.4s v1, v1, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mvn.16b v1, v1 +; CHECK-GI-NEXT: uzp1.8h v0, v0, v1 +; CHECK-GI-NEXT: xtn.8b v0, v0 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #6 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = 
icmp ne <8 x i32> %vec, zeroinitializer @@ -748,40 +749,40 @@ define i8 @convert_large_vector(<8 x i32> %vec) { } define i4 @convert_legalized_illegal_element_size(<4 x i22> %vec) { -; SDAG-LABEL: convert_legalized_illegal_element_size: -; SDAG: ; %bb.0: -; SDAG-NEXT: movi.4s v1, #63, msl #16 -; SDAG-NEXT: adrp x8, lCPI16_0@PAGE -; SDAG-NEXT: cmtst.4s v0, v0, v1 -; SDAG-NEXT: ldr d1, [x8, lCPI16_0@PAGEOFF] -; SDAG-NEXT: xtn.4h v0, v0 -; SDAG-NEXT: and.8b v0, v0, v1 -; SDAG-NEXT: addv.4h h0, v0 -; SDAG-NEXT: fmov w0, s0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: convert_legalized_illegal_element_size: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: movi.4s v1, #63, msl #16 +; CHECK-SD-NEXT: adrp x8, lCPI16_0@PAGE +; CHECK-SD-NEXT: cmtst.4s v0, v0, v1 +; CHECK-SD-NEXT: ldr d1, [x8, lCPI16_0@PAGEOFF] +; CHECK-SD-NEXT: xtn.4h v0, v0 +; CHECK-SD-NEXT: and.8b v0, v0, v1 +; CHECK-SD-NEXT: addv.4h h0, v0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: convert_legalized_illegal_element_size: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: movi.4s v1, #63, msl #16 -; GISEL-NEXT: and.16b v0, v0, v1 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: fmov w11, s0 -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w11, w8, #1, #31 -; GISEL-NEXT: and w8, w9, #0x1 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w11, w8, lsl #2 -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: convert_legalized_illegal_element_size: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: movi.4s v1, #63, msl #16 +; CHECK-GI-NEXT: and.16b v0, v0, v1 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: fmov w11, s0 +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <4 x i22> %vec, zeroinitializer %bitmask = bitcast <4 x i1> %cmp_result to i4 @@ -818,101 +819,101 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) { } define <8 x i1> @no_convert_without_direct_bitcast(<8 x i16> %vec) { -; SDAG-LABEL: no_convert_without_direct_bitcast: -; SDAG: ; %bb.0: -; SDAG-NEXT: cmtst.8h v0, v0, v0 -; SDAG-NEXT: xtn.8b v0, v0 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: no_convert_without_direct_bitcast: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: cmtst.8h v0, v0, v0 +; CHECK-SD-NEXT: xtn.8b v0, v0 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: no_convert_without_direct_bitcast: -; GISEL: ; %bb.0: -; GISEL-NEXT: cmeq.8h v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: xtn.8b v0, v0 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: no_convert_without_direct_bitcast: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: cmeq.8h v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: xtn.8b v0, v0 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer ret <8 x i1> %cmp_result } define i6 
@no_combine_illegal_num_elements(<6 x i32> %vec) { -; SDAG-LABEL: no_combine_illegal_num_elements: -; SDAG: ; %bb.0: -; SDAG-NEXT: sub sp, sp, #16 -; SDAG-NEXT: .cfi_def_cfa_offset 16 -; SDAG-NEXT: fmov s0, w0 -; SDAG-NEXT: fmov s1, w4 -; SDAG-NEXT: mov.s v0[1], w1 -; SDAG-NEXT: mov.s v1[1], w5 -; SDAG-NEXT: mov.s v0[2], w2 -; SDAG-NEXT: cmeq.4s v1, v1, #0 -; SDAG-NEXT: mov.s v0[3], w3 -; SDAG-NEXT: cmeq.4s v0, v0, #0 -; SDAG-NEXT: uzp1.8h v0, v0, v1 -; SDAG-NEXT: mvn.16b v0, v0 -; SDAG-NEXT: xtn.8b v0, v0 -; SDAG-NEXT: umov.b w8, v0[0] -; SDAG-NEXT: umov.b w9, v0[1] -; SDAG-NEXT: umov.b w10, v0[2] -; SDAG-NEXT: and w8, w8, #0x1 -; SDAG-NEXT: bfi w8, w9, #1, #1 -; SDAG-NEXT: umov.b w9, v0[3] -; SDAG-NEXT: bfi w8, w10, #2, #1 -; SDAG-NEXT: umov.b w10, v0[4] -; SDAG-NEXT: bfi w8, w9, #3, #1 -; SDAG-NEXT: umov.b w9, v0[5] -; SDAG-NEXT: bfi w8, w10, #4, #1 -; SDAG-NEXT: orr w8, w8, w9, lsl #5 -; SDAG-NEXT: and w0, w8, #0x3f -; SDAG-NEXT: add sp, sp, #16 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: no_combine_illegal_num_elements: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: fmov s1, w4 +; CHECK-SD-NEXT: mov.s v0[1], w1 +; CHECK-SD-NEXT: mov.s v1[1], w5 +; CHECK-SD-NEXT: mov.s v0[2], w2 +; CHECK-SD-NEXT: cmeq.4s v1, v1, #0 +; CHECK-SD-NEXT: mov.s v0[3], w3 +; CHECK-SD-NEXT: cmeq.4s v0, v0, #0 +; CHECK-SD-NEXT: uzp1.8h v0, v0, v1 +; CHECK-SD-NEXT: mvn.16b v0, v0 +; CHECK-SD-NEXT: xtn.8b v0, v0 +; CHECK-SD-NEXT: umov.b w8, v0[0] +; CHECK-SD-NEXT: umov.b w9, v0[1] +; CHECK-SD-NEXT: umov.b w10, v0[2] +; CHECK-SD-NEXT: and w8, w8, #0x1 +; CHECK-SD-NEXT: bfi w8, w9, #1, #1 +; CHECK-SD-NEXT: umov.b w9, v0[3] +; CHECK-SD-NEXT: bfi w8, w10, #2, #1 +; CHECK-SD-NEXT: umov.b w10, v0[4] +; CHECK-SD-NEXT: bfi w8, w9, #3, #1 +; CHECK-SD-NEXT: umov.b w9, v0[5] +; CHECK-SD-NEXT: bfi w8, w10, #4, #1 +; CHECK-SD-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-SD-NEXT: and w0, w8, #0x3f +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: no_combine_illegal_num_elements: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: .cfi_def_cfa_offset 16 -; GISEL-NEXT: mov.s v0[0], w0 -; GISEL-NEXT: mov.s v1[0], w4 -; GISEL-NEXT: mov.s v2[0], wzr -; GISEL-NEXT: mov.s v0[1], w1 -; GISEL-NEXT: mov.s v1[1], w5 -; GISEL-NEXT: mov.s v2[1], wzr -; GISEL-NEXT: mov.s v0[2], w2 -; GISEL-NEXT: cmeq.4s v1, v1, v2 -; GISEL-NEXT: mvn.16b v1, v1 -; GISEL-NEXT: mov.s v0[3], w3 -; GISEL-NEXT: cmeq.4s v0, v0, #0 -; GISEL-NEXT: mvn.16b v0, v0 -; GISEL-NEXT: mov.s w8, v0[1] -; GISEL-NEXT: mov.s w9, v0[2] -; GISEL-NEXT: mov.s w10, v0[3] -; GISEL-NEXT: mov.h v0[1], w8 -; GISEL-NEXT: mov.s w8, v1[1] -; GISEL-NEXT: mov.h v0[2], w9 -; GISEL-NEXT: mov.h v0[3], w10 -; GISEL-NEXT: mov.h v0[4], v1[0] -; GISEL-NEXT: mov.h v0[5], w8 -; GISEL-NEXT: umov.h w8, v0[1] -; GISEL-NEXT: umov.h w9, v0[0] -; GISEL-NEXT: umov.h w10, v0[2] -; GISEL-NEXT: umov.h w11, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: umov.h w10, v0[4] -; GISEL-NEXT: orr w8, w9, w8, lsl #2 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: umov.h w11, v0[5] -; GISEL-NEXT: orr w8, w8, w9, lsl #3 -; GISEL-NEXT: and w9, w10, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #4 -; GISEL-NEXT: and w9, w11, #0x1 -; GISEL-NEXT: orr w8, w8, w9, lsl #5 -; GISEL-NEXT: and w8, w8, #0x3f -; GISEL-NEXT: strb w8, [sp, #15] -; GISEL-NEXT: and w0, w8, #0xff -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: 
no_combine_illegal_num_elements: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: mov.s v0[0], w0 +; CHECK-GI-NEXT: mov.s v1[0], w4 +; CHECK-GI-NEXT: mov.s v2[0], wzr +; CHECK-GI-NEXT: mov.s v0[1], w1 +; CHECK-GI-NEXT: mov.s v1[1], w5 +; CHECK-GI-NEXT: mov.s v2[1], wzr +; CHECK-GI-NEXT: mov.s v0[2], w2 +; CHECK-GI-NEXT: cmeq.4s v1, v1, v2 +; CHECK-GI-NEXT: mvn.16b v1, v1 +; CHECK-GI-NEXT: mov.s v0[3], w3 +; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 +; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: mov.s w8, v0[1] +; CHECK-GI-NEXT: mov.s w9, v0[2] +; CHECK-GI-NEXT: mov.s w10, v0[3] +; CHECK-GI-NEXT: mov.h v0[1], w8 +; CHECK-GI-NEXT: mov.s w8, v1[1] +; CHECK-GI-NEXT: mov.h v0[2], w9 +; CHECK-GI-NEXT: mov.h v0[3], w10 +; CHECK-GI-NEXT: mov.h v0[4], v1[0] +; CHECK-GI-NEXT: mov.h v0[5], w8 +; CHECK-GI-NEXT: umov.h w8, v0[1] +; CHECK-GI-NEXT: umov.h w9, v0[0] +; CHECK-GI-NEXT: umov.h w10, v0[2] +; CHECK-GI-NEXT: umov.h w11, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: umov.h w10, v0[4] +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: umov.h w11, v0[5] +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 +; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 +; CHECK-GI-NEXT: and w8, w8, #0x3f +; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %cmp_result = icmp ne <6 x i32> %vec, zeroinitializer %bitmask = bitcast <6 x i1> %cmp_result to i6 @@ -921,220 +922,220 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) { ; Only apply the combine when casting a vector to a scalar. 
define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind { -; SDAG-LABEL: vector_to_vector_cast: -; SDAG: ; %bb.0: -; SDAG-NEXT: sub sp, sp, #16 -; SDAG-NEXT: shl.16b v0, v0, #7 -; SDAG-NEXT: adrp x8, lCPI20_0@PAGE -; SDAG-NEXT: ldr q1, [x8, lCPI20_0@PAGEOFF] -; SDAG-NEXT: add x8, sp, #14 -; SDAG-NEXT: cmlt.16b v0, v0, #0 -; SDAG-NEXT: and.16b v0, v0, v1 -; SDAG-NEXT: ext.16b v1, v0, v0, #8 -; SDAG-NEXT: zip1.16b v0, v0, v1 -; SDAG-NEXT: addv.8h h0, v0 -; SDAG-NEXT: str h0, [sp, #14] -; SDAG-NEXT: ld1.b { v0 }[0], [x8] -; SDAG-NEXT: orr x8, x8, #0x1 -; SDAG-NEXT: ld1.b { v0 }[4], [x8] -; SDAG-NEXT: ; kill: def $d0 killed $d0 killed $q0 -; SDAG-NEXT: add sp, sp, #16 -; SDAG-NEXT: ret +; CHECK-SD-LABEL: vector_to_vector_cast: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: shl.16b v0, v0, #7 +; CHECK-SD-NEXT: adrp x8, lCPI20_0@PAGE +; CHECK-SD-NEXT: ldr q1, [x8, lCPI20_0@PAGEOFF] +; CHECK-SD-NEXT: add x8, sp, #14 +; CHECK-SD-NEXT: cmlt.16b v0, v0, #0 +; CHECK-SD-NEXT: and.16b v0, v0, v1 +; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8 +; CHECK-SD-NEXT: zip1.16b v0, v0, v1 +; CHECK-SD-NEXT: addv.8h h0, v0 +; CHECK-SD-NEXT: str h0, [sp, #14] +; CHECK-SD-NEXT: ld1.b { v0 }[0], [x8] +; CHECK-SD-NEXT: orr x8, x8, #0x1 +; CHECK-SD-NEXT: ld1.b { v0 }[4], [x8] +; CHECK-SD-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret ; -; GISEL-LABEL: vector_to_vector_cast: -; GISEL: ; %bb.0: -; GISEL-NEXT: sub sp, sp, #16 -; GISEL-NEXT: umov.b w8, v0[1] -; GISEL-NEXT: mov d1, v0[1] -; GISEL-NEXT: umov.b w10, v0[1] -; GISEL-NEXT: umov.b w9, v0[0] -; GISEL-NEXT: umov.b w13, v0[0] -; GISEL-NEXT: umov.b w14, v0[2] -; GISEL-NEXT: umov.b w15, v0[3] -; GISEL-NEXT: umov.b w11, v0[2] -; GISEL-NEXT: umov.b w16, v0[4] -; GISEL-NEXT: umov.b w17, v0[5] -; GISEL-NEXT: umov.b w12, v0[3] -; GISEL-NEXT: and w8, w8, #0x1 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: umov.b w0, v1[1] -; GISEL-NEXT: bfi w9, w8, #1, #31 -; GISEL-NEXT: bfi w13, w10, #1, #31 -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: umov.b w8, v1[0] -; GISEL-NEXT: umov.b w10, v1[2] -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w13, w13, w14, lsl #2 -; GISEL-NEXT: umov.b w14, v1[3] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w0, w0, #0x1 -; GISEL-NEXT: and w16, w16, #0x1 -; GISEL-NEXT: orr w9, w9, w11, lsl #2 -; GISEL-NEXT: orr w13, w13, w15, lsl #3 -; GISEL-NEXT: umov.b w15, v1[4] -; GISEL-NEXT: umov.b w11, v0[6] -; GISEL-NEXT: bfi w8, w0, #1, #31 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: and w17, w17, #0x1 -; GISEL-NEXT: orr w13, w13, w16, lsl #4 -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: umov.b w0, v0[7] -; GISEL-NEXT: orr w8, w8, w10, lsl #2 -; GISEL-NEXT: umov.b w10, v1[5] -; GISEL-NEXT: umov.b w16, v1[6] -; GISEL-NEXT: orr w13, w13, w17, lsl #5 -; GISEL-NEXT: umov.b w17, v0[4] -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w8, w8, w14, lsl #3 -; GISEL-NEXT: and w12, w12, #0x1 -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: umov.b w14, v1[7] -; GISEL-NEXT: orr w9, w9, w12, lsl #3 -; GISEL-NEXT: orr w11, w13, w11, lsl #6 -; GISEL-NEXT: orr w8, w8, w15, lsl #4 -; GISEL-NEXT: umov.b w15, v0[5] -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: and w0, w0, #0x1 -; GISEL-NEXT: and w12, w17, #0x1 -; GISEL-NEXT: umov.b w13, v0[1] -; GISEL-NEXT: orr w8, w8, w10, lsl #5 -; GISEL-NEXT: and w16, w16, #0x1 -; GISEL-NEXT: orr w9, w9, w12, lsl #4 -; GISEL-NEXT: umov.b w10, v0[0] -; GISEL-NEXT: orr w11, w11, w0, lsl #7 -; GISEL-NEXT: and w14, w14, #0x1 -; 
GISEL-NEXT: and w12, w15, #0x1 -; GISEL-NEXT: umov.b w15, v0[2] -; GISEL-NEXT: orr w8, w8, w16, lsl #6 -; GISEL-NEXT: orr w9, w9, w12, lsl #5 -; GISEL-NEXT: umov.b w12, v0[6] -; GISEL-NEXT: strb w11, [sp, #8] -; GISEL-NEXT: and w11, w13, #0x1 -; GISEL-NEXT: umov.b w13, v0[3] -; GISEL-NEXT: orr w8, w8, w14, lsl #7 -; GISEL-NEXT: umov.b w14, v0[7] -; GISEL-NEXT: ldr b0, [sp, #8] -; GISEL-NEXT: bfi w10, w11, #1, #31 -; GISEL-NEXT: and w11, w15, #0x1 -; GISEL-NEXT: strb w8, [sp, #9] -; GISEL-NEXT: umov.b w15, v0[4] -; GISEL-NEXT: and w8, w12, #0x1 -; GISEL-NEXT: orr w10, w10, w11, lsl #2 -; GISEL-NEXT: orr w8, w9, w8, lsl #6 -; GISEL-NEXT: and w9, w13, #0x1 -; GISEL-NEXT: umov.b w11, v0[1] -; GISEL-NEXT: orr w9, w10, w9, lsl #3 -; GISEL-NEXT: umov.b w10, v0[5] -; GISEL-NEXT: umov.b w12, v0[0] -; GISEL-NEXT: and w13, w14, #0x1 -; GISEL-NEXT: umov.b w16, v0[2] -; GISEL-NEXT: umov.b w17, v0[3] -; GISEL-NEXT: and w14, w15, #0x1 -; GISEL-NEXT: umov.b w15, v0[2] -; GISEL-NEXT: orr w8, w8, w13, lsl #7 -; GISEL-NEXT: orr w9, w9, w14, lsl #4 -; GISEL-NEXT: umov.b w13, v0[6] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: umov.b w14, v0[3] -; GISEL-NEXT: strb w8, [sp, #10] -; GISEL-NEXT: and w8, w10, #0x1 -; GISEL-NEXT: bfi w12, w11, #1, #31 -; GISEL-NEXT: orr w8, w9, w8, lsl #5 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: and w9, w15, #0x1 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: umov.b w15, v0[1] -; GISEL-NEXT: orr w9, w12, w9, lsl #2 -; GISEL-NEXT: umov.b w12, v0[5] -; GISEL-NEXT: and w13, w13, #0x1 -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: orr w8, w8, w13, lsl #6 -; GISEL-NEXT: umov.b w13, v0[0] -; GISEL-NEXT: orr w9, w9, w14, lsl #3 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: umov.b w14, v0[6] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: umov.b w0, v0[3] -; GISEL-NEXT: orr w9, w9, w10, lsl #4 -; GISEL-NEXT: and w10, w12, #0x1 -; GISEL-NEXT: umov.b w12, v0[7] -; GISEL-NEXT: orr w8, w8, w11, lsl #7 -; GISEL-NEXT: bfi w13, w15, #1, #31 -; GISEL-NEXT: and w11, w16, #0x1 -; GISEL-NEXT: orr w9, w9, w10, lsl #5 -; GISEL-NEXT: and w10, w14, #0x1 -; GISEL-NEXT: umov.b w14, v0[4] -; GISEL-NEXT: strb w8, [sp, #11] -; GISEL-NEXT: umov.b w15, v0[1] -; GISEL-NEXT: umov.b w16, v0[3] -; GISEL-NEXT: orr w8, w9, w10, lsl #6 -; GISEL-NEXT: orr w9, w13, w11, lsl #2 -; GISEL-NEXT: and w10, w12, #0x1 -; GISEL-NEXT: and w11, w17, #0x1 -; GISEL-NEXT: umov.b w12, v0[5] -; GISEL-NEXT: umov.b w17, v0[0] -; GISEL-NEXT: orr w8, w8, w10, lsl #7 -; GISEL-NEXT: orr w9, w9, w11, lsl #3 -; GISEL-NEXT: umov.b w10, v0[1] -; GISEL-NEXT: and w11, w14, #0x1 -; GISEL-NEXT: umov.b w14, v0[0] -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w9, w9, w11, lsl #4 -; GISEL-NEXT: umov.b w11, v0[2] -; GISEL-NEXT: umov.b w13, v0[6] -; GISEL-NEXT: and w12, w12, #0x1 -; GISEL-NEXT: bfi w17, w15, #1, #31 -; GISEL-NEXT: umov.b w15, v0[5] -; GISEL-NEXT: orr w9, w9, w12, lsl #5 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: umov.b w12, v0[2] -; GISEL-NEXT: bfi w14, w10, #1, #31 -; GISEL-NEXT: umov.b w10, v0[4] -; GISEL-NEXT: ldr b1, [sp, #9] -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w13, w13, #0x1 -; GISEL-NEXT: strb w8, [sp, #12] -; GISEL-NEXT: orr w11, w14, w11, lsl #2 -; GISEL-NEXT: and w14, w16, #0x1 -; GISEL-NEXT: umov.b w16, v0[4] -; GISEL-NEXT: and w12, w12, #0x1 -; GISEL-NEXT: and w15, w15, #0x1 -; GISEL-NEXT: orr w9, w9, w13, lsl #6 -; GISEL-NEXT: orr w11, w11, w14, lsl #3 -; GISEL-NEXT: orr w12, w17, w12, lsl #2 -; GISEL-NEXT: and w10, w10, #0x1 -; GISEL-NEXT: and 
w17, w0, #0x1 -; GISEL-NEXT: umov.b w0, v0[5] -; GISEL-NEXT: umov.b w14, v0[6] -; GISEL-NEXT: orr w10, w11, w10, lsl #4 -; GISEL-NEXT: orr w12, w12, w17, lsl #3 -; GISEL-NEXT: umov.b w11, v0[7] -; GISEL-NEXT: and w16, w16, #0x1 -; GISEL-NEXT: umov.b w17, v0[6] -; GISEL-NEXT: orr w10, w10, w15, lsl #5 -; GISEL-NEXT: umov.b w15, v0[7] -; GISEL-NEXT: orr w12, w12, w16, lsl #4 -; GISEL-NEXT: and w16, w0, #0x1 -; GISEL-NEXT: umov.b w0, v0[7] -; GISEL-NEXT: and w14, w14, #0x1 -; GISEL-NEXT: orr w12, w12, w16, lsl #5 -; GISEL-NEXT: orr w10, w10, w14, lsl #6 -; GISEL-NEXT: and w11, w11, #0x1 -; GISEL-NEXT: and w13, w17, #0x1 -; GISEL-NEXT: orr w9, w9, w11, lsl #7 -; GISEL-NEXT: mov.s v0[1], v1[0] -; GISEL-NEXT: orr w11, w12, w13, lsl #6 -; GISEL-NEXT: and w12, w15, #0x1 -; GISEL-NEXT: ; kill: def $d0 killed $d0 killed $q0 -; GISEL-NEXT: orr w8, w10, w12, lsl #7 -; GISEL-NEXT: and w10, w0, #0x1 -; GISEL-NEXT: strb w9, [sp, #13] -; GISEL-NEXT: orr w9, w11, w10, lsl #7 -; GISEL-NEXT: strb w8, [sp, #14] -; GISEL-NEXT: strb w9, [sp, #15] -; GISEL-NEXT: add sp, sp, #16 -; GISEL-NEXT: ret +; CHECK-GI-LABEL: vector_to_vector_cast: +; CHECK-GI: ; %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #16 +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: mov d1, v0[1] +; CHECK-GI-NEXT: umov.b w10, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w13, v0[0] +; CHECK-GI-NEXT: umov.b w14, v0[2] +; CHECK-GI-NEXT: umov.b w15, v0[3] +; CHECK-GI-NEXT: umov.b w11, v0[2] +; CHECK-GI-NEXT: umov.b w16, v0[4] +; CHECK-GI-NEXT: umov.b w17, v0[5] +; CHECK-GI-NEXT: umov.b w12, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: umov.b w0, v1[1] +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: bfi w13, w10, #1, #31 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: umov.b w8, v1[0] +; CHECK-GI-NEXT: umov.b w10, v1[2] +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr w13, w13, w14, lsl #2 +; CHECK-GI-NEXT: umov.b w14, v1[3] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w0, w0, #0x1 +; CHECK-GI-NEXT: and w16, w16, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #2 +; CHECK-GI-NEXT: orr w13, w13, w15, lsl #3 +; CHECK-GI-NEXT: umov.b w15, v1[4] +; CHECK-GI-NEXT: umov.b w11, v0[6] +; CHECK-GI-NEXT: bfi w8, w0, #1, #31 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: and w17, w17, #0x1 +; CHECK-GI-NEXT: orr w13, w13, w16, lsl #4 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w10, lsl #2 +; CHECK-GI-NEXT: umov.b w10, v1[5] +; CHECK-GI-NEXT: umov.b w16, v1[6] +; CHECK-GI-NEXT: orr w13, w13, w17, lsl #5 +; CHECK-GI-NEXT: umov.b w17, v0[4] +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w14, lsl #3 +; CHECK-GI-NEXT: and w12, w12, #0x1 +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: umov.b w14, v1[7] +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #3 +; CHECK-GI-NEXT: orr w11, w13, w11, lsl #6 +; CHECK-GI-NEXT: orr w8, w8, w15, lsl #4 +; CHECK-GI-NEXT: umov.b w15, v0[5] +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: and w0, w0, #0x1 +; CHECK-GI-NEXT: and w12, w17, #0x1 +; CHECK-GI-NEXT: umov.b w13, v0[1] +; CHECK-GI-NEXT: orr w8, w8, w10, lsl #5 +; CHECK-GI-NEXT: and w16, w16, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #4 +; CHECK-GI-NEXT: umov.b w10, v0[0] +; CHECK-GI-NEXT: orr w11, w11, w0, lsl #7 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: and w12, w15, #0x1 +; CHECK-GI-NEXT: umov.b w15, v0[2] +; CHECK-GI-NEXT: orr w8, w8, w16, lsl #6 +; CHECK-GI-NEXT: orr w9, 
w9, w12, lsl #5 +; CHECK-GI-NEXT: umov.b w12, v0[6] +; CHECK-GI-NEXT: strb w11, [sp, #8] +; CHECK-GI-NEXT: and w11, w13, #0x1 +; CHECK-GI-NEXT: umov.b w13, v0[3] +; CHECK-GI-NEXT: orr w8, w8, w14, lsl #7 +; CHECK-GI-NEXT: umov.b w14, v0[7] +; CHECK-GI-NEXT: ldr b0, [sp, #8] +; CHECK-GI-NEXT: bfi w10, w11, #1, #31 +; CHECK-GI-NEXT: and w11, w15, #0x1 +; CHECK-GI-NEXT: strb w8, [sp, #9] +; CHECK-GI-NEXT: umov.b w15, v0[4] +; CHECK-GI-NEXT: and w8, w12, #0x1 +; CHECK-GI-NEXT: orr w10, w10, w11, lsl #2 +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #6 +; CHECK-GI-NEXT: and w9, w13, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[1] +; CHECK-GI-NEXT: orr w9, w10, w9, lsl #3 +; CHECK-GI-NEXT: umov.b w10, v0[5] +; CHECK-GI-NEXT: umov.b w12, v0[0] +; CHECK-GI-NEXT: and w13, w14, #0x1 +; CHECK-GI-NEXT: umov.b w16, v0[2] +; CHECK-GI-NEXT: umov.b w17, v0[3] +; CHECK-GI-NEXT: and w14, w15, #0x1 +; CHECK-GI-NEXT: umov.b w15, v0[2] +; CHECK-GI-NEXT: orr w8, w8, w13, lsl #7 +; CHECK-GI-NEXT: orr w9, w9, w14, lsl #4 +; CHECK-GI-NEXT: umov.b w13, v0[6] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[3] +; CHECK-GI-NEXT: strb w8, [sp, #10] +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: bfi w12, w11, #1, #31 +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #5 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: and w9, w15, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: umov.b w15, v0[1] +; CHECK-GI-NEXT: orr w9, w12, w9, lsl #2 +; CHECK-GI-NEXT: umov.b w12, v0[5] +; CHECK-GI-NEXT: and w13, w13, #0x1 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w13, lsl #6 +; CHECK-GI-NEXT: umov.b w13, v0[0] +; CHECK-GI-NEXT: orr w9, w9, w14, lsl #3 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[6] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[3] +; CHECK-GI-NEXT: orr w9, w9, w10, lsl #4 +; CHECK-GI-NEXT: and w10, w12, #0x1 +; CHECK-GI-NEXT: umov.b w12, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w11, lsl #7 +; CHECK-GI-NEXT: bfi w13, w15, #1, #31 +; CHECK-GI-NEXT: and w11, w16, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w10, lsl #5 +; CHECK-GI-NEXT: and w10, w14, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[4] +; CHECK-GI-NEXT: strb w8, [sp, #11] +; CHECK-GI-NEXT: umov.b w15, v0[1] +; CHECK-GI-NEXT: umov.b w16, v0[3] +; CHECK-GI-NEXT: orr w8, w9, w10, lsl #6 +; CHECK-GI-NEXT: orr w9, w13, w11, lsl #2 +; CHECK-GI-NEXT: and w10, w12, #0x1 +; CHECK-GI-NEXT: and w11, w17, #0x1 +; CHECK-GI-NEXT: umov.b w12, v0[5] +; CHECK-GI-NEXT: umov.b w17, v0[0] +; CHECK-GI-NEXT: orr w8, w8, w10, lsl #7 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #3 +; CHECK-GI-NEXT: umov.b w10, v0[1] +; CHECK-GI-NEXT: and w11, w14, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[0] +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #4 +; CHECK-GI-NEXT: umov.b w11, v0[2] +; CHECK-GI-NEXT: umov.b w13, v0[6] +; CHECK-GI-NEXT: and w12, w12, #0x1 +; CHECK-GI-NEXT: bfi w17, w15, #1, #31 +; CHECK-GI-NEXT: umov.b w15, v0[5] +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #5 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: umov.b w12, v0[2] +; CHECK-GI-NEXT: bfi w14, w10, #1, #31 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: ldr b1, [sp, #9] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w13, w13, #0x1 +; CHECK-GI-NEXT: strb w8, [sp, #12] +; CHECK-GI-NEXT: orr w11, w14, w11, lsl #2 +; CHECK-GI-NEXT: and w14, w16, #0x1 +; CHECK-GI-NEXT: umov.b w16, v0[4] +; CHECK-GI-NEXT: and w12, w12, #0x1 +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr 
w9, w9, w13, lsl #6 +; CHECK-GI-NEXT: orr w11, w11, w14, lsl #3 +; CHECK-GI-NEXT: orr w12, w17, w12, lsl #2 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: and w17, w0, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[5] +; CHECK-GI-NEXT: umov.b w14, v0[6] +; CHECK-GI-NEXT: orr w10, w11, w10, lsl #4 +; CHECK-GI-NEXT: orr w12, w12, w17, lsl #3 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: and w16, w16, #0x1 +; CHECK-GI-NEXT: umov.b w17, v0[6] +; CHECK-GI-NEXT: orr w10, w10, w15, lsl #5 +; CHECK-GI-NEXT: umov.b w15, v0[7] +; CHECK-GI-NEXT: orr w12, w12, w16, lsl #4 +; CHECK-GI-NEXT: and w16, w0, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[7] +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: orr w12, w12, w16, lsl #5 +; CHECK-GI-NEXT: orr w10, w10, w14, lsl #6 +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w13, w17, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #7 +; CHECK-GI-NEXT: mov.s v0[1], v1[0] +; CHECK-GI-NEXT: orr w11, w12, w13, lsl #6 +; CHECK-GI-NEXT: and w12, w15, #0x1 +; CHECK-GI-NEXT: ; kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: orr w8, w10, w12, lsl #7 +; CHECK-GI-NEXT: and w10, w0, #0x1 +; CHECK-GI-NEXT: strb w9, [sp, #13] +; CHECK-GI-NEXT: orr w9, w11, w10, lsl #7 +; CHECK-GI-NEXT: strb w8, [sp, #14] +; CHECK-GI-NEXT: strb w9, [sp, #15] +; CHECK-GI-NEXT: add sp, sp, #16 +; CHECK-GI-NEXT: ret %bc = bitcast <16 x i1> %arg to <2 x i8> ret <2 x i8> %bc } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll b/llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll new file mode 100644 index 0000000000000..f3b8deff61918 --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/init-whole.wave.ll @@ -0,0 +1,17 @@ +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -stop-after=finalize-isel < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -stop-after=finalize-isel < %s | FileCheck --check-prefix=GCN %s + +; GCN-LABEL: name: init_wwm +; GCN: hasInitWholeWave: true +define void @init_wwm(ptr addrspace(1) inreg %p) { +entry: + %entry_exec = call i1 @llvm.amdgcn.init.whole.wave() + br i1 %entry_exec, label %bb.1, label %bb.2 + +bb.1: + store i32 1, ptr addrspace(1) %p + br label %bb.2 + +bb.2: + ret void +} diff --git a/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll index da212a1850964..1b93ae029f27b 100755 --- a/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx_movrs_intrinsics.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-movrs | FileCheck %s +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR define void @test_amx_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { ; CHECK-LABEL: test_amx_internal: @@ -35,6 +36,44 @@ define void @test_amx_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 ; CHECK-NEXT: tilerelease ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_internal: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: pushq %rbp # encoding: [0x55] +; EGPR-NEXT: .cfi_def_cfa_offset 16 +; EGPR-NEXT: .cfi_offset %rbp, -16 +; EGPR-NEXT: movq %rsp, %rbp # encoding: [0x48,0x89,0xe5] +; EGPR-NEXT: .cfi_def_cfa_register %rbp +; EGPR-NEXT: andq $-1024, %rsp # encoding: [0x48,0x81,0xe4,0x00,0xfc,0xff,0xff] +; EGPR-NEXT: # imm = 0xFC00 +; EGPR-NEXT: subq $3072, %rsp # encoding: [0x48,0x81,0xec,0x00,0x0c,0x00,0x00] +; EGPR-NEXT: # imm = 
0xC00 +; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xe0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xc0,0x03,0x00,0x00,0x01] +; EGPR-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: # encoding: [0x48,0x89,0x8c,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: # encoding: [0x48,0x8b,0xb4,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movw %ax, %cx # encoding: [0x66,0x89,0xc1] +; EGPR-NEXT: movw %di, %ax # encoding: [0x66,0x89,0xf8] +; EGPR-NEXT: # implicit-def: $al +; EGPR-NEXT: movb %al, {{[0-9]+}}(%rsp) # encoding: [0x88,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movw %cx, {{[0-9]+}}(%rsp) # encoding: [0x66,0x89,0x8c,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: tileloaddrs (%rdx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4a,0x04,0x32] +; EGPR-NEXT: movl $64, %esi # encoding: [0xbe,0x40,0x00,0x00,0x00] +; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdx # encoding: [0x48,0x8d,0x94,0x24,0x00,0x04,0x00,0x00] +; EGPR-NEXT: tilestored %tmm0, (%rdx,%rsi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x32] +; EGPR-NEXT: movq %rbp, %rsp # encoding: [0x48,0x89,0xec] +; EGPR-NEXT: popq %rbp # encoding: [0x5d] +; EGPR-NEXT: .cfi_def_cfa %rsp, 8 +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %t1 = call x86_amx @llvm.x86.tileloaddrs64.internal(i16 %m, i16 %n, ptr %buf, i64 %s) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) @@ -48,6 +87,12 @@ define void @test_amx_old(i16 %m, i16 %n, ptr %buf) { ; CHECK-NEXT: movl $32, %eax ; CHECK-NEXT: tileloaddrs (%rdx,%rax), %tmm2 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_old: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl $32, %eax # encoding: [0xb8,0x20,0x00,0x00,0x00] +; EGPR-NEXT: tileloaddrs (%rdx,%rax), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4a,0x14,0x02] +; EGPR-NEXT: retq # encoding: [0xc3] entry: call void @llvm.x86.tileloaddrs64(i8 2, ptr %buf, i64 32) ret void @@ -88,6 +133,44 @@ define void @test_amx_t1_internal(i16 %m, i16 %n, ptr %buf, i64 %s) { ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 ; CHECK-NEXT: tilerelease ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_t1_internal: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: pushq %rbp # encoding: [0x55] +; EGPR-NEXT: .cfi_def_cfa_offset 16 +; EGPR-NEXT: .cfi_offset %rbp, -16 +; EGPR-NEXT: movq %rsp, %rbp # encoding: [0x48,0x89,0xe5] +; EGPR-NEXT: .cfi_def_cfa_register %rbp +; EGPR-NEXT: andq $-1024, %rsp # encoding: [0x48,0x81,0xe4,0x00,0xfc,0xff,0xff] +; EGPR-NEXT: # imm = 0xFC00 +; EGPR-NEXT: subq $3072, %rsp # encoding: [0x48,0x81,0xec,0x00,0x0c,0x00,0x00] +; EGPR-NEXT: # imm = 0xC00 +; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: 
[0x0f,0x11,0x84,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xe0,0x03,0x00,0x00] +; EGPR-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xc0,0x03,0x00,0x00,0x01] +; EGPR-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: # encoding: [0x48,0x89,0x8c,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: # encoding: [0x48,0x8b,0xb4,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movw %ax, %cx # encoding: [0x66,0x89,0xc1] +; EGPR-NEXT: movw %di, %ax # encoding: [0x66,0x89,0xf8] +; EGPR-NEXT: # implicit-def: $al +; EGPR-NEXT: movb %al, {{[0-9]+}}(%rsp) # encoding: [0x88,0x84,0x24,0xf0,0x03,0x00,0x00] +; EGPR-NEXT: movw %cx, {{[0-9]+}}(%rsp) # encoding: [0x66,0x89,0x8c,0x24,0xd0,0x03,0x00,0x00] +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: tileloaddrst1 (%rdx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x4a,0x04,0x32] +; EGPR-NEXT: movl $64, %esi # encoding: [0xbe,0x40,0x00,0x00,0x00] +; EGPR-NEXT: leaq {{[0-9]+}}(%rsp), %rdx # encoding: [0x48,0x8d,0x94,0x24,0x00,0x04,0x00,0x00] +; EGPR-NEXT: tilestored %tmm0, (%rdx,%rsi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x32] +; EGPR-NEXT: movq %rbp, %rsp # encoding: [0x48,0x89,0xec] +; EGPR-NEXT: popq %rbp # encoding: [0x5d] +; EGPR-NEXT: .cfi_def_cfa %rsp, 8 +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %t1 = call x86_amx @llvm.x86.tileloaddrst164.internal(i16 %m, i16 %n, ptr %buf, i64 %s) %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) @@ -101,6 +184,12 @@ define void @test_amx_t1_old(i16 %m, i16 %n, ptr %buf) { ; CHECK-NEXT: movl $32, %eax ; CHECK-NEXT: tileloaddrst1 (%rdx,%rax), %tmm2 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_t1_old: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl $32, %eax # encoding: [0xb8,0x20,0x00,0x00,0x00] +; EGPR-NEXT: tileloaddrst1 (%rdx,%rax), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x4a,0x14,0x02] +; EGPR-NEXT: retq # encoding: [0xc3] entry: call void @llvm.x86.tileloaddrst164(i8 2, ptr %buf, i64 32) ret void diff --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll index 146b69773eb18..1f5758c804b2b 100755 --- a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0 ; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2 +; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR define void @test_amx(i64 %stride, i8* %addr1) #0 { ; CHECK-LABEL: test_amx: @@ -10,6 +11,14 @@ define void @test_amx(i64 %stride, i8* %addr1) #0 { ; CHECK-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 ; CHECK-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx: +; EGPR: # %bb.0: +; 
EGPR-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x04,0x3e] +; EGPR-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x3e] +; EGPR-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x04,0x3e] +; EGPR-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x3e] +; EGPR-NEXT: retq # encoding: [0xc3] call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride) @@ -80,6 +89,27 @@ define void @test_amx2(i8* %base, i64 %stride) #0 { ; O2-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 ; O2-NEXT: tilerelease ; O2-NEXT: retq +; +; EGPR-LABEL: test_amx2: +; EGPR: # %bb.0: +; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xc0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xd0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xe0] +; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xf0] +; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x08,0x00] +; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] +; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] +; EGPR-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x24,0x37] +; EGPR-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x24,0x37] +; EGPR-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x24,0x37] +; EGPR-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x24,0x37] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: retq # encoding: [0xc3] call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) diff --git a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll index cc4360317db7d..4cfd97afe721b 100644 --- a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 { ; CHECK-LABEL: 
test_amx: @@ -16,6 +17,21 @@ define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x floa ; CHECK-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 ; CHECK-NEXT: tconjtfp16 %tmm2, %tmm1 ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx: +; EGPR: # %bb.0: +; EGPR-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x04,0x31] +; EGPR-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x31] +; EGPR-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x04,0x31] +; EGPR-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x31] +; EGPR-NEXT: ttransposed %tmm3, %tmm1 # encoding: [0xc4,0xe2,0x7a,0x5f,0xcb] +; EGPR-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6c,0xca] +; EGPR-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4 # encoding: [0xc4,0xe2,0x4b,0x6c,0xe5] +; EGPR-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x63,0x6b,0xca] +; EGPR-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6b,0xca] +; EGPR-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x60,0x6b,0xca] +; EGPR-NEXT: tconjtfp16 %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x79,0x6b,0xca] +; EGPR-NEXT: retq # encoding: [0xc3] call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride) call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride) @@ -78,6 +94,46 @@ define void @test_amx2(i8* %pointer, i8* %base, i64 %stride) #0 { ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx2: +; EGPR: # %bb.0: +; EGPR-NEXT: pushq %rbp # encoding: [0x55] +; EGPR-NEXT: subq $2928, %rsp # encoding: [0x48,0x81,0xec,0x70,0x0b,0x00,0x00] +; EGPR-NEXT: # imm = 0xB70 +; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] +; EGPR-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0x0d] +; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x40,0x03,0x00,0x00,0x01] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x70,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x50,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x71,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x52,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x72,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x54,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x73,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x56,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0x40,0x03,0x00,0x00] +; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] +; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] +; EGPR-NEXT: tilezero %tmm1 # encoding: [0xc4,0xe2,0x7b,0x49,0xc8] +; EGPR-NEXT: tilezero %tmm2 # encoding: [0xc4,0xe2,0x7b,0x49,0xd0] +; EGPR-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6c,0xd0] +; EGPR-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2 # encoding: 
[0xc4,0xe2,0x73,0x6c,0xd0] +; EGPR-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6b,0xd0] +; EGPR-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6b,0xd0] +; EGPR-NEXT: movabsq $64, %rbp # encoding: [0x48,0xbd,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +; EGPR-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x94,0x2c,0x80,0x03,0x00,0x00] +; EGPR-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x2c,0x80,0x03,0x00,0x00] +; EGPR-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3 # encoding: [0xc4,0xe2,0x70,0x6b,0xd8] +; EGPR-NEXT: tconjtfp16 %tmm3, %tmm0 # encoding: [0xc4,0xe2,0x79,0x6b,0xc3] +; EGPR-NEXT: tilestored %tmm2, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x14,0x17] +; EGPR-NEXT: addq $2928, %rsp # encoding: [0x48,0x81,0xc4,0x70,0x0b,0x00,0x00] +; EGPR-NEXT: # imm = 0xB70 +; EGPR-NEXT: popq %rbp # encoding: [0x5d] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: retq # encoding: [0xc3] %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) @@ -117,6 +173,30 @@ define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 { ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx3: +; EGPR: # %bb.0: +; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] +; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xff] +; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf0,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd0,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00] +; EGPR-NEXT: movb $0, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x00] +; EGPR-NEXT: movw $0, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x00,0x00] +; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0] +; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; EGPR-NEXT: movw $8, %cx # encoding: [0x66,0xb9,0x08,0x00] +; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x24,0x16] +; EGPR-NEXT: ttransposed %tmm4, %tmm0 # encoding: [0xc4,0xe2,0x7a,0x5f,0xc4] +; EGPR-NEXT: tilestored %tmm0, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x17] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: retq # encoding: [0xc3] %1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) %2 = call { x86_amx, x86_amx } 
@llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) %3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride) @@ -179,6 +259,72 @@ define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 { ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx_spill: +; EGPR: # %bb.0: +; EGPR-NEXT: subq $6088, %rsp # encoding: [0x48,0x81,0xec,0xc8,0x17,0x00,0x00] +; EGPR-NEXT: # imm = 0x17C8 +; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] +; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xfe] +; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x80,0x01] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb0,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x90,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb4,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x98,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb5,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9a,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb6,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9c,0x08,0x00] +; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb7,0x08] +; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9e,0x08,0x00] +; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x80] +; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00] +; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16] +; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16] +; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x34,0x16] +; EGPR-NEXT: movabsq $64, %rcx # encoding: [0x48,0xb9,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +; EGPR-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x0f,0x00,0x00] +; EGPR-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x13,0x00,0x00] +; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x34,0x16] +; EGPR-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x07,0x00,0x00] +; EGPR-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x0b,0x00,0x00] +; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x34,0x16] +; EGPR-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0x74,0x0c,0xc0] +; EGPR-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x34,0x16] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x0f,0x00,0x00] +; EGPR-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x13,0x00,0x00] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x07,0x00,0x00] +; EGPR-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x0b,0x00,0x00] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0x64,0x0c,0xc0] +; EGPR-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload +; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16] +; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16] +; EGPR-NEXT: tilestored %tmm6, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x34,0x16] +; EGPR-NEXT: tilestored %tmm7, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x3c,0x16] +; EGPR-NEXT: addq $6088, %rsp # encoding: [0x48,0x81,0xc4,0xc8,0x17,0x00,0x00] +; EGPR-NEXT: # imm = 0x17C8 +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: retq # encoding: [0xc3] %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) %b1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) %b2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride) diff --git a/llvm/test/CodeGen/X86/dag-large-offset.ll b/llvm/test/CodeGen/X86/dag-large-offset.ll new file mode 100644 index 0000000000000..2774a93993153 --- /dev/null +++ b/llvm/test/CodeGen/X86/dag-large-offset.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=i386 --frame-pointer=all | FileCheck %s + +; ISel will try to fold pointer arithmetic into the address displacement. However, we don't +; want to do that if the offset is very close to the expressible limit because the final frame +; layout may push it over/under the limit. 
+ +define i32 @foo(i1 %b) #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .cfi_offset %ebp, -8 +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: .cfi_def_cfa_register %ebp +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: movl __stack_chk_guard, %eax +; CHECK-NEXT: movl %eax, -4(%ebp) +; CHECK-NEXT: testb $1, 8(%ebp) +; CHECK-NEXT: jne .LBB0_1 +; CHECK-NEXT: # %bb.2: # %entry +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: jmp .LBB0_3 +; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: movl $-2147483647, %eax # imm = 0x80000001 +; CHECK-NEXT: leal -5(%ebp,%eax), %eax +; CHECK-NEXT: .LBB0_3: # %entry +; CHECK-NEXT: movl __stack_chk_guard, %ecx +; CHECK-NEXT: cmpl -4(%ebp), %ecx +; CHECK-NEXT: jne .LBB0_5 +; CHECK-NEXT: # %bb.4: # %entry +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: .cfi_def_cfa %esp, 4 +; CHECK-NEXT: retl +; CHECK-NEXT: .LBB0_5: # %entry +; CHECK-NEXT: .cfi_def_cfa %ebp, 8 +; CHECK-NEXT: calll __stack_chk_fail +entry: + %a = alloca i8, align 1 + %0 = ptrtoint ptr %a to i32 + %sub = add i32 %0, -2147483647 + %retval.0 = select i1 %b, i32 %sub, i32 0 + ret i32 %retval.0 +} + +attributes #0 = { sspreq } diff --git a/llvm/test/CodeGen/X86/movrs-builtins.ll b/llvm/test/CodeGen/X86/movrs-builtins.ll index c1722c831c95d..ccf0833e53990 100644 --- a/llvm/test/CodeGen/X86/movrs-builtins.ll +++ b/llvm/test/CodeGen/X86/movrs-builtins.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+movrs | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+movrs,+egpr | FileCheck %s --check-prefix=EGPR define i8 @test_movrs_si8(ptr %__A) { ; CHECK-LABEL: test_movrs_si8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsb (%rdi), %al # encoding: [0x0f,0x38,0x8a,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si8: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsb (%rdi), %al # EVEX TO LEGACY Compression encoding: [0x0f,0x38,0x8a,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = call i8 @llvm.x86.movrsqi(ptr %__A) ret i8 %0 @@ -17,6 +23,11 @@ define i16 @test_movrs_si16(ptr %__A) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsw (%rdi), %ax # encoding: [0x66,0x0f,0x38,0x8b,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si16: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsw (%rdi), %ax # EVEX TO LEGACY Compression encoding: [0x66,0x0f,0x38,0x8b,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = call i16 @llvm.x86.movrshi(ptr %__A) ret i16 %0 @@ -28,6 +39,11 @@ define i32 @test_movrs_si32(ptr %__A) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsl (%rdi), %eax # encoding: [0x0f,0x38,0x8b,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsl (%rdi), %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x38,0x8b,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = call i32 @llvm.x86.movrssi(ptr %__A) ret i32 %0 @@ -39,6 +55,11 @@ define i64 @test_movrs_si64(ptr %__A) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movrsq (%rdi), %rax # encoding: [0x48,0x0f,0x38,0x8b,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_movrs_si64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movrsq (%rdi), %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x38,0x8b,0x07] +; EGPR-NEXT: 
retq # encoding: [0xc3] entry: %0 = call i64 @llvm.x86.movrsdi(ptr %__A) ret i64 %0 diff --git a/llvm/test/CodeGen/X86/xor-lea.ll b/llvm/test/CodeGen/X86/xor-lea.ll index 10e9525a2706a..d50752e48d293 100644 --- a/llvm/test/CodeGen/X86/xor-lea.ll +++ b/llvm/test/CodeGen/X86/xor-lea.ll @@ -327,7 +327,8 @@ define i32 @xor_shl_sminval_i32(i32 %x) { ; X86-LABEL: xor_shl_sminval_i32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal -2147483648(,%eax,8), %eax +; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 +; X86-NEXT: leal (%ecx,%eax,8), %eax ; X86-NEXT: retl ; ; X64-LABEL: xor_shl_sminval_i32: diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt index 6df44c87d2332..57e3153da401b 100755 --- a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt +++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-movrs.txt @@ -96,3 +96,99 @@ # ATT: tileloaddrst1 -32(,%rbp,2), %tmm3 # INTEL: tileloaddrst1 tmm3, [2*rbp - 32] 0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 +# INTEL: t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0rs 64(%r18), %tmm6 +# INTEL: t2rpntlvwz0rs tmm6, [r18 + 64] +0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40 + +# ATT: t2rpntlvwz0rs -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0rs tmm2, [2*rbp - 32] +0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 +# INTEL: t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0rst1 64(%r18), %tmm6 +# INTEL: t2rpntlvwz0rst1 tmm6, [r18 + 64] +0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40 + +# ATT: t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0rst1 tmm2, [2*rbp - 32] +0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 +# INTEL: t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1rs 64(%r18), %tmm6 +# INTEL: t2rpntlvwz1rs tmm6, [r18 + 64] +0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40 + +# ATT: t2rpntlvwz1rs -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1rs tmm2, [2*rbp - 32] +0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 +# INTEL: t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] +0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1rst1 64(%r18), %tmm6 +# INTEL: t2rpntlvwz1rst1 tmm6, [r18 + 64] +0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40 + +# ATT: t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1rst1 tmm2, [2*rbp - 32] +0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: tileloaddrs 268435456(%r16,%r14,8), %tmm6 +# INTEL: tileloaddrs tmm6, [r16 + 8*r14 + 268435456] 
+0x62,0xba,0x7f,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: tileloaddrs 291(%r8,%r17,4), %tmm3 +# INTEL: tileloaddrs tmm3, [r8 + 4*r17 + 291] +0x62,0xd2,0x7b,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00 + +# ATT: tileloaddrs 64(%r18), %tmm6 +# INTEL: tileloaddrs tmm6, [r18 + 64] +0x62,0xfa,0x7f,0x08,0x4a,0x74,0x22,0x40 + +# ATT: tileloaddrs -32(,%rbp,2), %tmm3 +# INTEL: tileloaddrs tmm3, [2*rbp - 32] +0x62,0xf2,0x7f,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff + +# ATT: tileloaddrst1 268435456(%r16,%r14,8), %tmm6 +# INTEL: tileloaddrst1 tmm6, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7d,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: tileloaddrst1 291(%r8,%r17,4), %tmm3 +# INTEL: tileloaddrst1 tmm3, [r8 + 4*r17 + 291] +0x62,0xd2,0x79,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00 + +# ATT: tileloaddrst1 64(%r18), %tmm6 +# INTEL: tileloaddrst1 tmm6, [r18 + 64] +0x62,0xfa,0x7d,0x08,0x4a,0x74,0x22,0x40 + +# ATT: tileloaddrst1 -32(,%rbp,2), %tmm3 +# INTEL: tileloaddrst1 tmm3, [2*rbp - 32] +0x62,0xf2,0x7d,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff diff --git a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt index 8c6f1be80ba2d..d768630ac1475 100644 --- a/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt +++ b/llvm/test/MC/Disassembler/X86/amx-transpose-att.txt @@ -49,6 +49,54 @@ # INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32] 0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff +# ATT: t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0 tmm2, [2*rbp - 32] +0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz0t1 tmm2, [2*rbp - 32] +0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1 tmm2, [2*rbp - 32] +0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 +# INTEL: t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] +0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10 + +# ATT: t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 +# INTEL: t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] +0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00 + +# ATT: t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 +# INTEL: t2rpntlvwz1t1 tmm2, [2*rbp - 32] +0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff + # ATT: ttransposed %tmm1, %tmm2 # INTEL: ttransposed tmm2, tmm1 0xc4,0xe2,0x7a,0x5f,0xd1 diff --git a/llvm/test/MC/Disassembler/X86/movrs.txt b/llvm/test/MC/Disassembler/X86/movrs.txt index fa91b542d3f73..caac8bc8b7b30 100644 --- a/llvm/test/MC/Disassembler/X86/movrs.txt +++ 
b/llvm/test/MC/Disassembler/X86/movrs.txt @@ -95,4 +95,100 @@ # ATT: movrsq -128(%rdx), %rbx # INTEL: movrs rbx, qword ptr [rdx - 128] -0x48,0x0f,0x38,0x8b,0x5a,0x80 \ No newline at end of file +0x48,0x0f,0x38,0x8b,0x5a,0x80 + +# ATT: movrsb 268435456(%rbp,%r14,8), %r16b +# INTEL: movrs r16b, byte ptr [rbp + 8*r14 + 268435456] +0x62,0xa4,0x7c,0x08,0x8a,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsb 291(%r17,%rax,4), %bl +# INTEL: movrs bl, byte ptr [r17 + 4*rax + 291] +0x62,0xfc,0x7c,0x08,0x8a,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsb (%rip), %bl +# INTEL: movrs bl, byte ptr [rip] +0x62,0xf4,0x7c,0x08,0x8a,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsb -32(,%rbp,2), %r18b +# INTEL: movrs r18b, byte ptr [2*rbp - 32] +0x62,0xe4,0x7c,0x08,0x8a,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsb 127(%r19), %bl +# INTEL: movrs bl, byte ptr [r19 + 127] +0x62,0xfc,0x7c,0x08,0x8a,0x5b,0x7f + +# ATT: movrsb -128(%r20,%riz), %bl +# INTEL: movrs bl, byte ptr [r20 + riz - 128] +0x62,0xfc,0x7c,0x08,0x8a,0x5c,0x24,0x80 + +# ATT: movrsw 268435456(%rbp,%r14,8), %r16w +# INTEL: movrs r16w, word ptr [rbp + 8*r14 + 268435456] +0x62,0xa4,0x7d,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsw 291(%r17,%rax,4), %bx +# INTEL: movrs bx, word ptr [r17 + 4*rax + 291] +0x62,0xfc,0x7d,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsw (%rip), %bx +# INTEL: movrs bx, word ptr [rip] +0x62,0xf4,0x7d,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsw -32(,%rbp,2), %r18w +# INTEL: movrs r18w, word ptr [2*rbp - 32] +0x62,0xe4,0x7d,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsw 127(%r19), %bx +# INTEL: movrs bx, word ptr [r19 + 127] +0x62,0xfc,0x7d,0x08,0x8b,0x5b,0x7f + +# ATT: movrsw -128(%r20,%riz), %bx +# INTEL: movrs bx, word ptr [r20 + riz - 128] +0x62,0xfc,0x7d,0x08,0x8b,0x5c,0x24,0x80 + +# ATT: movrsl 268435456(%rbp,%r14,8), %r16d +# INTEL: movrs r16d, dword ptr [rbp + 8*r14 + 268435456] +0x62,0xa4,0x7c,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsl 291(%r17,%rax,4), %ebx +# INTEL: movrs ebx, dword ptr [r17 + 4*rax + 291] +0x62,0xfc,0x7c,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsl (%rip), %ebx +# INTEL: movrs ebx, dword ptr [rip] +0x62,0xf4,0x7c,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsl -32(,%rbp,2), %r18d +# INTEL: movrs r18d, dword ptr [2*rbp - 32] +0x62,0xe4,0x7c,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsl 127(%r19), %ebx +# INTEL: movrs ebx, dword ptr [r19 + 127] +0x62,0xfc,0x7c,0x08,0x8b,0x5b,0x7f + +# ATT: movrsl -128(%r20,%riz), %ebx +# INTEL: movrs ebx, dword ptr [r20 + riz - 128] +0x62,0xfc,0x7c,0x08,0x8b,0x5c,0x24,0x80 + +# ATT: movrsq 268435456(%rbp,%r14,8), %r16 +# INTEL: movrs r16, qword ptr [rbp + 8*r14 + 268435456] +0x62,0xa4,0xfc,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10 + +# ATT: movrsq 291(%r17,%rax,4), %rbx +# INTEL: movrs rbx, qword ptr [r17 + 4*rax + 291] +0x62,0xfc,0xfc,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00 + +# ATT: movrsq (%rip), %rbx +# INTEL: movrs rbx, qword ptr [rip] +0x62,0xf4,0xfc,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00 + +# ATT: movrsq -32(,%rbp,2), %r18 +# INTEL: movrs r18, qword ptr [2*rbp - 32] +0x62,0xe4,0xfc,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff + +# ATT: movrsq 127(%r19), %rbx +# INTEL: movrs rbx, qword ptr [r19 + 127] +0x62,0xfc,0xfc,0x08,0x8b,0x5b,0x7f + +# ATT: movrsq -128(%r20,%riz), %rbx +# INTEL: movrs rbx, qword ptr [r20 + riz - 128] +0x62,0xfc,0xfc,0x08,0x8b,0x5c,0x24,0x80 diff --git a/llvm/test/MC/LoongArch/Macros/aliases-la.s b/llvm/test/MC/LoongArch/Macros/aliases-la.s index dd5a4d474e001..1b5b818f4348f 
100644 --- a/llvm/test/MC/LoongArch/Macros/aliases-la.s +++ b/llvm/test/MC/LoongArch/Macros/aliases-la.s @@ -3,13 +3,26 @@ # RUN: llvm-mc --triple=loongarch64 %s \ # RUN: | FileCheck %s --check-prefix=NORMAL +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=RELOC,RELAX # RUN: llvm-mc --triple=loongarch64 --mattr=+la-global-with-pcrel < %s \ # RUN: | FileCheck %s --check-prefix=GTOPCR +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-pcrel \ +# RUN: --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=GTOPCR-RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-pcrel \ +# RUN: --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=GTOPCR-RELOC,GTOPCR-RELAX # RUN: llvm-mc --triple=loongarch64 --mattr=+la-global-with-abs < %s \ # RUN: | FileCheck %s --check-prefix=GTOABS # RUN: llvm-mc --triple=loongarch64 --mattr=+la-local-with-abs < %s \ # RUN: | FileCheck %s --check-prefix=LTOABS +# RELOC: Relocations [ +# RELOC-NEXT: Section ({{.*}}) .rela.text { + la $a0, sym # NORMAL: pcalau12i $a0, %got_pc_hi20(sym) # NORMAL-NEXT: ld.d $a0, $a0, %got_pc_lo12(sym) @@ -22,6 +35,16 @@ la $a0, sym # GTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym) # GTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym) +# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + +# GTOPCR-RELOC: R_LARCH_PCALA_HI20 sym 0x0 +# GTOPCR-RELAX: R_LARCH_RELAX - 0x0 +# GTOPCR-RELOC-NEXT: R_LARCH_PCALA_LO12 sym 0x0 +# GTOPCR-RELAX-NEXT: R_LARCH_RELAX - 0x0 + la.global $a0, sym_global # NORMAL: pcalau12i $a0, %got_pc_hi20(sym_global) # NORMAL-NEXT: ld.d $a0, $a0, %got_pc_lo12(sym_global) @@ -34,6 +57,16 @@ la.global $a0, sym_global # GTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_global) # GTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_global) +# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym_global 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_global 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + +# GTOPCR-RELOC-NEXT: R_LARCH_PCALA_HI20 sym_global 0x0 +# GTOPCR-RELAX-NEXT: R_LARCH_RELAX - 0x0 +# GTOPCR-RELOC-NEXT: R_LARCH_PCALA_LO12 sym_global 0x0 +# GTOPCR-RELAX-NEXT: R_LARCH_RELAX - 0x0 + la.global $a0, $a1, sym_global_large # NORMAL: pcalau12i $a0, %got_pc_hi20(sym_global_large) # NORMAL-NEXT: addi.d $a1, $zero, %got_pc_lo12(sym_global_large) @@ -52,6 +85,11 @@ la.global $a0, $a1, sym_global_large # GTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_global_large) # GTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_global_large) +# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym_global_large 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_global_large 0x0 +# RELOC-NEXT: R_LARCH_GOT64_PC_LO20 sym_global_large 0x0 +# RELOC-NEXT: R_LARCH_GOT64_PC_HI12 sym_global_large 0x0 + la.local $a0, sym_local # NORMAL: pcalau12i $a0, %pc_hi20(sym_local) # NORMAL-NEXT: addi.d $a0, $a0, %pc_lo12(sym_local) @@ -61,6 +99,11 @@ la.local $a0, sym_local # LTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_local) # LTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_local) +# RELOC-NEXT: R_LARCH_PCALA_HI20 sym_local 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 +# RELOC-NEXT: R_LARCH_PCALA_LO12 sym_local 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + la.local 
$a0, $a1, sym_local_large # NORMAL: pcalau12i $a0, %pc_hi20(sym_local_large) # NORMAL-NEXT: addi.d $a1, $zero, %pc_lo12(sym_local_large) @@ -72,3 +115,12 @@ la.local $a0, $a1, sym_local_large # LTOABS-NEXT: ori $a0, $a0, %abs_lo12(sym_local_large) # LTOABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_local_large) # LTOABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_local_large) + +# RELOC-NEXT: R_LARCH_PCALA_HI20 sym_local_large 0x0 +# RELOC-NEXT: R_LARCH_PCALA_LO12 sym_local_large 0x0 +# RELOC-NEXT: R_LARCH_PCALA64_LO20 sym_local_large 0x0 +# RELOC-NEXT: R_LARCH_PCALA64_HI12 sym_local_large 0x0 + + +# RELOC-NEXT: } +# RELOC-NEXT: ] diff --git a/llvm/test/MC/LoongArch/Macros/macros-call.s b/llvm/test/MC/LoongArch/Macros/macros-call.s index a648a39780381..df7715050a0f9 100644 --- a/llvm/test/MC/LoongArch/Macros/macros-call.s +++ b/llvm/test/MC/LoongArch/Macros/macros-call.s @@ -1,9 +1,26 @@ # RUN: llvm-mc --triple=loongarch64 %s | FileCheck %s +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=RELOC,RELAX + +# RELOC: Relocations [ +# RELOC-NEXT: Section ({{.*}}) .rela.text { call36 sym_call # CHECK: pcaddu18i $ra, %call36(sym_call) # CHECK-NEXT: jirl $ra, $ra, 0 +# RELOC-NEXT: R_LARCH_CALL36 sym_call 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + tail36 $t0, sym_tail # CHECK: pcaddu18i $t0, %call36(sym_tail) # CHECK-NEXT: jr $t0 + +# RELOC-NEXT: R_LARCH_CALL36 sym_tail 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 + + +# RELOC-NEXT: } +# RELOC-NEXT: ] diff --git a/llvm/test/MC/LoongArch/Macros/macros-la.s b/llvm/test/MC/LoongArch/Macros/macros-la.s index d4272b93ba54d..a732988ef1f1a 100644 --- a/llvm/test/MC/LoongArch/Macros/macros-la.s +++ b/llvm/test/MC/LoongArch/Macros/macros-la.s @@ -5,6 +5,12 @@ # RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=RELOC,RELAX # RUN: llvm-mc --triple=loongarch64 --mattr=+la-global-with-abs \ # RUN: %s | FileCheck %s --check-prefix=ABS +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-abs \ +# RUN: --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=GTOABS-RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+la-global-with-abs \ +# RUN: --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=GTOABS-RELOC,GTOABS-RELAX # RELOC: Relocations [ # RELOC-NEXT: Section ({{.*}}) .rela.text { @@ -36,6 +42,10 @@ la.pcrel $a0, sym_pcrel # RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_PCALA_LO12 sym_pcrel 0x0 # RELAX-NEXT: R_LARCH_RELAX - 0x0 +# GTOABS-RELOC: R_LARCH_PCALA_HI20 sym_pcrel 0x0 +# GTOABS-RELAX-NEXT: R_LARCH_RELAX - 0x0 +# GTOABS-RELOC-NEXT: R_LARCH_PCALA_LO12 sym_pcrel 0x0 +# GTOABS-RELAX-NEXT: R_LARCH_RELAX - 0x0 la.got $a0, sym_got # CHECK-NEXT: pcalau12i $a0, %got_pc_hi20(sym_got) @@ -73,7 +83,9 @@ la.tls.ie $a0, sym_ie # ABS-NEXT: ld.d $a0, $a0, 0 # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_IE_PC_HI20 sym_ie 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_IE_PC_LO12 sym_ie 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 la.tls.ld $a0, sym_ld # CHECK-NEXT: pcalau12i $a0, %ld_pc_hi20(sym_ld) @@ -85,7 +97,9 @@ la.tls.ld $a0, sym_ld # ABS-NEXT: lu52i.d $a0, $a0, %got64_hi12(sym_ld) # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_LD_PC_HI20 sym_ld 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_ld 0x0 
+# RELAX-NEXT: R_LARCH_RELAX - 0x0 la.tls.gd $a0, sym_gd # CHECK-NEXT: pcalau12i $a0, %gd_pc_hi20(sym_gd) @@ -97,7 +111,9 @@ la.tls.gd $a0, sym_gd # ABS-NEXT: lu52i.d $a0, $a0, %got64_hi12(sym_gd) # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_GD_PC_HI20 sym_gd 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_gd 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 la.tls.desc $a0, sym_desc # CHECK-NEXT: pcalau12i $a0, %desc_pc_hi20(sym_desc) @@ -113,9 +129,13 @@ la.tls.desc $a0, sym_desc # ABS-NEXT: jirl $ra, $ra, %desc_call(sym_desc) # ABS-EMPTY: # RELOC-NEXT: R_LARCH_TLS_DESC_PC_HI20 sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_DESC_PC_LO12 sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_DESC_LD sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 # RELOC-NEXT: R_LARCH_TLS_DESC_CALL sym_desc 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 ############################################################# ## with a temporary register. diff --git a/llvm/test/MC/LoongArch/Relocations/relax-tls-le.s b/llvm/test/MC/LoongArch/Relocations/relax-tls-le.s new file mode 100644 index 0000000000000..899f12f85654d --- /dev/null +++ b/llvm/test/MC/LoongArch/Relocations/relax-tls-le.s @@ -0,0 +1,70 @@ +# RUN: llvm-mc --filetype=obj --triple=loongarch32 --mattr=+relax < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA32-RELAX-RELOC %s +# RUN: llvm-mc --filetype=obj --triple=loongarch32 --mattr=-relax < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA32-NORELAX-RELOC %s +# RUN: llvm-mc --triple=loongarch32 --mattr=+relax < %s --show-encoding \ +# RUN: | FileCheck --check-prefix=LA32-RELAX-FIXUP %s + +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax --defsym=LA64=1 < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA64-RELAX-RELOC %s +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax --defsym=LA64=1 < %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefix=LA64-NORELAX-RELOC %s +# RUN: llvm-mc --triple=loongarch64 --mattr=+relax --defsym=LA64=1 < %s --show-encoding \ +# RUN: | FileCheck --check-prefix=LA64-RELAX-FIXUP %s + +.long foo + +.ifndef LA64 + +lu12i.w $a0, %le_hi20_r(foo) +# LA32-NORELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA32-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA32-RELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA32-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA32-RELAX-FIXUP: fixup A - offset: 0, value: %le_hi20_r(foo), kind: FK_NONE +# LA32-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +add.w $a0, $a0, $tp, %le_add_r(foo) +# LA32-NORELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA32-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA32-RELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA32-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA32-RELAX-FIXUP: fixup A - offset: 0, value: %le_add_r(foo), kind: FK_NONE +# LA32-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +addi.w $a0, $a0, %le_lo12_r(foo) +# LA32-NORELAX-RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA32-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA32-RELAX-RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA32-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA32-RELAX-FIXUP: fixup A - offset: 0, value: %le_lo12_r(foo), kind: FK_NONE +# LA32-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +.else + +lu12i.w $a0, %le_hi20_r(foo) +# LA64-NORELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA64-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA64-RELAX-RELOC: R_LARCH_TLS_LE_HI20_R foo 0x0 +# LA64-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# 
LA64-RELAX-FIXUP: fixup A - offset: 0, value: %le_hi20_r(foo), kind: FK_NONE +# LA64-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +add.d $a0, $a0, $tp, %le_add_r(foo) +# LA64-NORELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA64-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA64-RELAX-RELOC: R_LARCH_TLS_LE_ADD_R foo 0x0 +# LA64-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA64-RELAX-FIXUP: fixup A - offset: 0, value: %le_add_r(foo), kind: FK_NONE +# LA64-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +addi.d $a0, $a0, %le_lo12_r(foo) +# LA64-NORELAX-RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA64-NORELAX-RELOC-NOT: R_LARCH_RELAX - 0x0 +# LA64-RELAX-RELOC: R_LARCH_TLS_LE_LO12_R foo 0x0 +# LA64-RELAX-RELOC: R_LARCH_RELAX - 0x0 +# LA64-RELAX-FIXUP: fixup A - offset: 0, value: %le_lo12_r(foo), kind: FK_NONE +# LA64-RELAX-FIXUP: fixup B - offset: 0, value: 0, kind: FK_NONE + +.endif + diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s index d780ad4f0e369..92db672e1c82d 100755 --- a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-att.s @@ -86,4 +86,92 @@ // CHECK: tileloaddrst1 -32(,%rbp,2), %tmm3 // CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] - tileloaddrst1 -32(,%rbp,2), %tmm3 \ No newline at end of file + tileloaddrst1 -32(,%rbp,2), %tmm3 + +// CHECK: t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 +// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rs 268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rs 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz0rs 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40] + t2rpntlvwz0rs 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz0rs -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rs -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 +// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rst1 268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rst1 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz0rst1 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz0rst1 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rst1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 +// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rs 268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rs 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz1rs 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40] + t2rpntlvwz1rs 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz1rs -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rs -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1rst1 268435456(%r16,%r14,8), %tmm6 +// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rst1 
268435456(%r16,%r14,8), %tmm6 + +// CHECK: t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rst1 291(%r8,%r17,4), %tmm2 + +// CHECK: t2rpntlvwz1rst1 64(%r18), %tmm6 +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz1rst1 64(%r18), %tmm6 + +// CHECK: {evex} t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rst1 -32(,%rbp,2), %tmm2 + +// CHECK: tileloaddrs 291(%r16,%rax,4), %tmm3 +// CHECK: encoding: [0x62,0xfa,0x7f,0x08,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrs 291(%r16,%rax,4), %tmm3 + +// CHECK: tileloaddrs 291(%r8,%r17,4), %tmm3 +// CHECK: encoding: [0x62,0xd2,0x7b,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrs 291(%r8,%r17,4), %tmm3 + +// CHECK: {evex} tileloaddrs -32(,%rbp,2), %tmm3 +// CHECK: encoding: [0x62,0xf2,0x7f,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrs -32(,%rbp,2), %tmm3 + +// CHECK: tileloaddrst1 291(%r16,%rax,4), %tmm3 +// CHECK: encoding: [0x62,0xfa,0x7d,0x08,0x4a,0x9c,0x80,0x23,0x01,0x00,0x00] + tileloaddrst1 291(%r16,%rax,4), %tmm3 + +// CHECK: tileloaddrst1 291(%r8,%r17,4), %tmm3 +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrst1 291(%r8,%r17,4), %tmm3 + +// CHECK: {evex} tileloaddrst1 -32(,%rbp,2), %tmm3 +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrst1 -32(,%rbp,2), %tmm3 diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s index ccc7ac51a98a4..140d1aa6b198e 100755 --- a/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s +++ b/llvm/test/MC/X86/AMX/x86-64-amx-movrs-intel.s @@ -95,3 +95,99 @@ // CHECK: tileloaddrst1 tmm3, [2*rbp - 32] // CHECK: encoding: [0xc4,0xe2,0x79,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] tileloaddrst1 tmm3, [2*rbp - 32] + +// CHECK: t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rs tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rs tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz0rs tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf8,0x74,0x22,0x40] + t2rpntlvwz0rs tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz0rs tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rs tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7c,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0rst1 tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd5,0x78,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0rst1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz0rst1 tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7c,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz0rst1 tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz0rst1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf5,0x7c,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0rst1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf8,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rs tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] +// CHECK: 
encoding: [0x62,0xd5,0x79,0x08,0xf8,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rs tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz1rs tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf8,0x74,0x22,0x40] + t2rpntlvwz1rs tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz1rs tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf8,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rs tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xbd,0x7d,0x08,0xf9,0xb4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1rst1 tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd5,0x79,0x08,0xf9,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1rst1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: t2rpntlvwz1rst1 tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfd,0x7d,0x08,0xf9,0x74,0x22,0x40] + t2rpntlvwz1rst1 tmm6, [r18 + 64] + +// CHECK: {evex} t2rpntlvwz1rst1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0xf9,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1rst1 tmm2, [2*rbp - 32] + +// CHECK: tileloaddrs tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7f,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10] + tileloaddrs tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: tileloaddrs tmm3, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x7b,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrs tmm3, [r8 + 4*r17 + 291] + +// CHECK: tileloaddrs tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfa,0x7f,0x08,0x4a,0x74,0x22,0x40] + tileloaddrs tmm6, [r18 + 64] + +// CHECK: {evex} tileloaddrs tmm3, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7f,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrs tmm3, [2*rbp - 32] + +// CHECK: tileloaddrst1 tmm6, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x4a,0xb4,0xf0,0x00,0x00,0x00,0x10] + tileloaddrst1 tmm6, [r16 + 8*r14 + 268435456] + +// CHECK: tileloaddrst1 tmm3, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x4a,0x9c,0x88,0x23,0x01,0x00,0x00] + tileloaddrst1 tmm3, [r8 + 4*r17 + 291] + +// CHECK: tileloaddrst1 tmm6, [r18 + 64] +// CHECK: encoding: [0x62,0xfa,0x7d,0x08,0x4a,0x74,0x22,0x40] + tileloaddrst1 tmm6, [r18 + 64] + +// CHECK: {evex} tileloaddrst1 tmm3, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x4a,0x1c,0x6d,0xe0,0xff,0xff,0xff] + {evex} tileloaddrst1 tmm3, [2*rbp - 32] diff --git a/llvm/test/MC/X86/amx-transpose-att.s b/llvm/test/MC/X86/amx-transpose-att.s index 21bbf258ac6ef..5158470f8c905 100644 --- a/llvm/test/MC/X86/amx-transpose-att.s +++ b/llvm/test/MC/X86/amx-transpose-att.s @@ -48,6 +48,54 @@ // CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 +// CHECK: t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0 268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz0 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz0 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0t1 268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: 
[0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0t1 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0t1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1 268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1 -32(,%rbp,2), %tmm2 + +// CHECK: t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1t1 268435456(%r16,%r14,8), %tmm4 + +// CHECK: t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1t1 291(%r8,%r17,4), %tmm2 + +// CHECK: {evex} t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1t1 -32(,%rbp,2), %tmm2 + // CHECK: ttransposed %tmm1, %tmm5 // CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9] ttransposed %tmm1, %tmm5 diff --git a/llvm/test/MC/X86/amx-transpose-intel.s b/llvm/test/MC/X86/amx-transpose-intel.s index a772232ddbbf2..0d2c22f67a173 100644 --- a/llvm/test/MC/X86/amx-transpose-intel.s +++ b/llvm/test/MC/X86/amx-transpose-intel.s @@ -48,6 +48,54 @@ // CHECK: encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] t2rpntlvwz1t1 tmm2, [2*rbp - 32] +// CHECK: t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz0 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7c,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz0t1 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x78,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz0t1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz0t1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7c,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz0t1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6e,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6e,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6e,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1 tmm2, [2*rbp - 32] + +// CHECK: t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xba,0x7d,0x08,0x6f,0xa4,0xf0,0x00,0x00,0x00,0x10] + t2rpntlvwz1t1 tmm4, [r16 + 8*r14 + 268435456] + +// CHECK: t2rpntlvwz1t1 tmm2, [r8 + 4*r17 
+ 291] +// CHECK: encoding: [0x62,0xd2,0x79,0x08,0x6f,0x94,0x88,0x23,0x01,0x00,0x00] + t2rpntlvwz1t1 tmm2, [r8 + 4*r17 + 291] + +// CHECK: {evex} t2rpntlvwz1t1 tmm2, [2*rbp - 32] +// CHECK: encoding: [0x62,0xf2,0x7d,0x08,0x6f,0x14,0x6d,0xe0,0xff,0xff,0xff] + {evex} t2rpntlvwz1t1 tmm2, [2*rbp - 32] + // CHECK: ttransposed tmm5, tmm1 // CHECK: encoding: [0xc4,0xe2,0x7a,0x5f,0xe9] ttransposed tmm5, tmm1 diff --git a/llvm/test/MC/X86/movrs-att-64.s b/llvm/test/MC/X86/movrs-att-64.s index 59a2fdb6d10b2..e951b30369d46 100644 --- a/llvm/test/MC/X86/movrs-att-64.s +++ b/llvm/test/MC/X86/movrs-att-64.s @@ -94,4 +94,100 @@ // CHECK: movrsq -128(%rdx), %rbx // CHECK: encoding: [0x48,0x0f,0x38,0x8b,0x5a,0x80] - movrs -128(%rdx), %rbx \ No newline at end of file + movrs -128(%rdx), %rbx + +// CHECK: movrsb 268435456(%rbp,%r14,8), %r16b +// CHECK: encoding: [0x62,0xa4,0x7c,0x08,0x8a,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16b + +// CHECK: movrsb 291(%r17,%rax,4), %bl +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %bl + +// CHECK: {evex} movrsb (%rip), %bl +// CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x8a,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %bl + +// CHECK: movrsb -32(,%rbp,2), %r18b +// CHECK: encoding: [0x62,0xe4,0x7c,0x08,0x8a,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18b + +// CHECK: movrsb 127(%r19), %bl +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5b,0x7f] + movrs 127(%r19), %bl + +// CHECK: movrsb -128(%r20), %bl +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5c,0x24,0x80] + movrs -128(%r20), %bl + +// CHECK: movrsw 268435456(%rbp,%r14,8), %r16w +// CHECK: encoding: [0x62,0xa4,0x7d,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16w + +// CHECK: movrsw 291(%r17,%rax,4), %bx +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %bx + +// CHECK: {evex} movrsw (%rip), %bx +// CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %bx + +// CHECK: movrsw -32(,%rbp,2), %r18w +// CHECK: encoding: [0x62,0xe4,0x7d,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18w + +// CHECK: movrsw 127(%r19), %bx +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5b,0x7f] + movrs 127(%r19), %bx + +// CHECK: movrsw -128(%r20), %bx +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5c,0x24,0x80] + movrs -128(%r20), %bx + +// CHECK: movrsl 268435456(%rbp,%r14,8), %r16d +// CHECK: encoding: [0x62,0xa4,0x7c,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16d + +// CHECK: movrsl 291(%r17,%rax,4), %ebx +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %ebx + +// CHECK: {evex} movrsl (%rip), %ebx +// CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %ebx + +// CHECK: movrsl -32(,%rbp,2), %r18d +// CHECK: encoding: [0x62,0xe4,0x7c,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18d + +// CHECK: movrsl 127(%r19), %ebx +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5b,0x7f] + movrs 127(%r19), %ebx + +// CHECK: movrsl -128(%r20), %ebx +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5c,0x24,0x80] + movrs -128(%r20), %ebx + +// CHECK: movrsq 268435456(%rbp,%r14,8), %r16 +// CHECK: encoding: [0x62,0xa4,0xfc,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs 268435456(%rbp,%r14,8), %r16 + +// CHECK: movrsq 291(%r17,%rax,4), %rbx +// CHECK: encoding: 
[0x62,0xfc,0xfc,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs 291(%r17,%rax,4), %rbx + +// CHECK: {evex} movrsq (%rip), %rbx +// CHECK: encoding: [0x62,0xf4,0xfc,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs (%rip), %rbx + +// CHECK: movrsq -32(,%rbp,2), %r18 +// CHECK: encoding: [0x62,0xe4,0xfc,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs -32(,%rbp,2), %r18 + +// CHECK: movrsq 127(%r19), %rbx +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5b,0x7f] + movrs 127(%r19), %rbx + +// CHECK: movrsq -128(%r20), %rbx +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5c,0x24,0x80] + movrs -128(%r20), %rbx diff --git a/llvm/test/MC/X86/movrs-intel-64.s b/llvm/test/MC/X86/movrs-intel-64.s index f41075a21b3e8..f698f1c440442 100644 --- a/llvm/test/MC/X86/movrs-intel-64.s +++ b/llvm/test/MC/X86/movrs-intel-64.s @@ -94,4 +94,100 @@ // CHECK: movrs rbx, qword ptr [rdx - 128] // CHECK: encoding: [0x48,0x0f,0x38,0x8b,0x5a,0x80] - movrs rbx, qword ptr [rdx - 128] \ No newline at end of file + movrs rbx, qword ptr [rdx - 128] + +// CHECK: movrs r16b, byte ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0x7c,0x08,0x8a,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16b, byte ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs bl, byte ptr [r17 + 4*rax + 291] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs bl, byte ptr [r17 + 4*rax + 291] + +// CHECK: {evex} movrs bl, byte ptr [rip] +// CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x8a,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs bl, byte ptr [rip] + +// CHECK: movrs r18b, byte ptr [2*rbp - 32] +// CHECK: encoding: [0x62,0xe4,0x7c,0x08,0x8a,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18b, byte ptr [2*rbp - 32] + +// CHECK: movrs bl, byte ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5b,0x7f] + movrs bl, byte ptr [r19 + 127] + +// CHECK: movrs bl, byte ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8a,0x5c,0x24,0x80] + movrs bl, byte ptr [r20 - 128] + +// CHECK: movrs r16w, word ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0x7d,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16w, word ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs bx, word ptr [r17 + 4*rax + 291] +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs bx, word ptr [r17 + 4*rax + 291] + +// CHECK: {evex} movrs bx, word ptr [rip] +// CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs bx, word ptr [rip] + +// CHECK: movrs r18w, word ptr [2*rbp - 32] +// CHECK: encoding: [0x62,0xe4,0x7d,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18w, word ptr [2*rbp - 32] + +// CHECK: movrs bx, word ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5b,0x7f] + movrs bx, word ptr [r19 + 127] + +// CHECK: movrs bx, word ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0x7d,0x08,0x8b,0x5c,0x24,0x80] + movrs bx, word ptr [r20 - 128] + +// CHECK: movrs r16d, dword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0x7c,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16d, dword ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs ebx, dword ptr [r17 + 4*rax + 291] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs ebx, dword ptr [r17 + 4*rax + 291] + +// CHECK: {evex} movrs ebx, dword ptr [rip] +// CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs ebx, dword ptr [rip] + +// CHECK: movrs r18d, dword ptr [2*rbp - 32] +// CHECK: encoding: 
[0x62,0xe4,0x7c,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18d, dword ptr [2*rbp - 32] + +// CHECK: movrs ebx, dword ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5b,0x7f] + movrs ebx, dword ptr [r19 + 127] + +// CHECK: movrs ebx, dword ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0x7c,0x08,0x8b,0x5c,0x24,0x80] + movrs ebx, dword ptr [r20 - 128] + +// CHECK: movrs r16, qword ptr [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0x62,0xa4,0xfc,0x08,0x8b,0x84,0xf5,0x00,0x00,0x00,0x10] + movrs r16, qword ptr [rbp + 8*r14 + 268435456] + +// CHECK: movrs rbx, qword ptr [r17 + 4*rax + 291] +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x9c,0x81,0x23,0x01,0x00,0x00] + movrs rbx, qword ptr [r17 + 4*rax + 291] + +// CHECK: {evex} movrs rbx, qword ptr [rip] +// CHECK: encoding: [0x62,0xf4,0xfc,0x08,0x8b,0x1d,0x00,0x00,0x00,0x00] + {evex} movrs rbx, qword ptr [rip] + +// CHECK: movrs r18, qword ptr [2*rbp - 32] +// CHECK: encoding: [0x62,0xe4,0xfc,0x08,0x8b,0x14,0x6d,0xe0,0xff,0xff,0xff] + movrs r18, qword ptr [2*rbp - 32] + +// CHECK: movrs rbx, qword ptr [r19 + 127] +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5b,0x7f] + movrs rbx, qword ptr [r19 + 127] + +// CHECK: movrs rbx, qword ptr [r20 - 128] +// CHECK: encoding: [0x62,0xfc,0xfc,0x08,0x8b,0x5c,0x24,0x80] + movrs rbx, qword ptr [r20 - 128] diff --git a/llvm/test/TableGen/x86-instr-mapping.inc b/llvm/test/TableGen/x86-instr-mapping.inc index ed43684db2dfc..4f64d4b8d93d0 100644 --- a/llvm/test/TableGen/x86-instr-mapping.inc +++ b/llvm/test/TableGen/x86-instr-mapping.inc @@ -133,6 +133,10 @@ static const X86TableEntry X86CompressEVEXTable[] = { { X86::MOVDIR64B64_EVEX, X86::MOVDIR64B64 }, { X86::MOVDIRI32_EVEX, X86::MOVDIRI32 }, { X86::MOVDIRI64_EVEX, X86::MOVDIRI64 }, + { X86::MOVRS16rm_EVEX, X86::MOVRS16rm }, + { X86::MOVRS32rm_EVEX, X86::MOVRS32rm }, + { X86::MOVRS64rm_EVEX, X86::MOVRS64rm }, + { X86::MOVRS8rm_EVEX, X86::MOVRS8rm }, { X86::MULX32rm_EVEX, X86::MULX32rm }, { X86::MULX32rr_EVEX, X86::MULX32rr }, { X86::MULX64rm_EVEX, X86::MULX64rm }, @@ -163,6 +167,16 @@ static const X86TableEntry X86CompressEVEXTable[] = { { X86::SHRX64rm_EVEX, X86::SHRX64rm }, { X86::SHRX64rr_EVEX, X86::SHRX64rr }, { X86::STTILECFG_EVEX, X86::STTILECFG }, + { X86::T2RPNTLVWZ0RST1_EVEX, X86::T2RPNTLVWZ0RST1 }, + { X86::T2RPNTLVWZ0RS_EVEX, X86::T2RPNTLVWZ0RS }, + { X86::T2RPNTLVWZ0T1_EVEX, X86::T2RPNTLVWZ0T1 }, + { X86::T2RPNTLVWZ0_EVEX, X86::T2RPNTLVWZ0 }, + { X86::T2RPNTLVWZ1RST1_EVEX, X86::T2RPNTLVWZ1RST1 }, + { X86::T2RPNTLVWZ1RS_EVEX, X86::T2RPNTLVWZ1RS }, + { X86::T2RPNTLVWZ1T1_EVEX, X86::T2RPNTLVWZ1T1 }, + { X86::T2RPNTLVWZ1_EVEX, X86::T2RPNTLVWZ1 }, + { X86::TILELOADDRST1_EVEX, X86::TILELOADDRST1 }, + { X86::TILELOADDRS_EVEX, X86::TILELOADDRS }, { X86::TILELOADDT1_EVEX, X86::TILELOADDT1 }, { X86::TILELOADD_EVEX, X86::TILELOADD }, { X86::TILESTORED_EVEX, X86::TILESTORED }, diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll new file mode 100644 index 0000000000000..4b6a19d3f05cf --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -0,0 +1,368 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names)" --version 4 + +; REQUIRES: aarch64-registered-target + +; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s + +target datalayout = 
"e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +$test_single_bb_resolver.resolver = comdat any +$test_multi_bb_resolver.resolver = comdat any +$test_caller_feats_not_implied.resolver = comdat any +$test_non_fmv_caller.resolver = comdat any +$test_priority.resolver = comdat any +$test_alternative_names.resolver = comdat any + +@__aarch64_cpu_features = external local_unnamed_addr global { i64 } + +@test_single_bb_resolver = weak_odr ifunc i32 (), ptr @test_single_bb_resolver.resolver +@test_multi_bb_resolver = weak_odr ifunc i32 (), ptr @test_multi_bb_resolver.resolver +@test_caller_feats_not_implied = weak_odr ifunc i32 (), ptr @test_caller_feats_not_implied.resolver +@test_non_fmv_caller = weak_odr ifunc i32 (), ptr @test_non_fmv_caller.resolver +@test_priority = weak_odr ifunc i32 (), ptr @test_priority.resolver +@test_alternative_names = weak_odr ifunc i32 (), ptr @test_alternative_names.resolver + +declare void @__init_cpu_features_resolver() local_unnamed_addr + +declare i32 @test_single_bb_resolver.default() #0 +declare i32 @test_single_bb_resolver._Msve() #1 +declare i32 @test_single_bb_resolver._Msve2() #2 + +define weak_odr ptr @test_single_bb_resolver.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_single_bb_resolver.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 68719476736 + %.not = icmp eq i64 %1, 0 + %2 = and i64 %0, 1073741824 + %.not3 = icmp eq i64 %2, 0 + %test_single_bb_resolver._Msve.test_single_bb_resolver.default = select i1 %.not3, ptr @test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve + %common.ret.op = select i1 %.not, ptr %test_single_bb_resolver._Msve.test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve2 + ret ptr %common.ret.op +} + +define i32 @caller1._Msve() #1 { +; CHECK-LABEL: define i32 @caller1._Msve( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + ret i32 %call +} + +define i32 @caller1._Msve2() #2 { +; CHECK-LABEL: define i32 @caller1._Msve2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve2() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + ret i32 %call +} + +define i32 @caller1.default() #0 { +; CHECK-LABEL: define i32 @caller1.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver.default() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + ret i32 %call +} + +declare i32 @test_multi_bb_resolver._Mmops() #3 +declare i32 @test_multi_bb_resolver._Msve2() #2 +declare i32 @test_multi_bb_resolver._Msve() #1 +declare i32 @test_multi_bb_resolver.default() #0 + +define weak_odr ptr @test_multi_bb_resolver.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_multi_bb_resolver.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + br i1 %.not, label %resolver_else, label %common.ret + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_multi_bb_resolver._Mmops, %resolver_entry ], [ @test_multi_bb_resolver._Msve2, 
%resolver_else ], [ %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %2 = and i64 %0, 68719476736 + %.not5 = icmp eq i64 %2, 0 + br i1 %.not5, label %resolver_else2, label %common.ret + +resolver_else2: ; preds = %resolver_else + %3 = and i64 %0, 1073741824 + %.not6 = icmp eq i64 %3, 0 + %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default = select i1 %.not6, ptr @test_multi_bb_resolver.default, ptr @test_multi_bb_resolver._Msve + br label %common.ret +} + +define i32 @caller2._MmopsMsve2() #4 { +; CHECK-LABEL: define i32 @caller2._MmopsMsve2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +define i32 @caller2._Mmops() #3 { +; CHECK-LABEL: define i32 @caller2._Mmops( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR3:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +define i32 @caller2._Msve() #1 { +; CHECK-LABEL: define i32 @caller2._Msve( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +define i32 @caller2.default() #0 { +; CHECK-LABEL: define i32 @caller2.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver.default() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +declare i32 @test_caller_feats_not_implied._Mmops() #3 +declare i32 @test_caller_feats_not_implied._Msme() #5 +declare i32 @test_caller_feats_not_implied._Msve() #1 +declare i32 @test_caller_feats_not_implied.default() #0 + +define weak_odr ptr @test_caller_feats_not_implied.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_caller_feats_not_implied.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + br i1 %.not, label %resolver_else, label %common.ret + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_caller_feats_not_implied._Mmops, %resolver_entry ], [ @test_caller_feats_not_implied._Msme, %resolver_else ], [ %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %2 = and i64 %0, 4398046511104 + %.not5 = icmp eq i64 %2, 0 + br i1 %.not5, label %resolver_else2, label %common.ret + +resolver_else2: ; preds = %resolver_else + %3 = and i64 %0, 1073741824 + %.not6 = icmp eq i64 %3, 0 + %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default = select i1 %.not6, ptr @test_caller_feats_not_implied.default, ptr @test_caller_feats_not_implied._Msve + br label %common.ret +} + +define i32 @caller3._Mmops() #3 { +; CHECK-LABEL: define i32 @caller3._Mmops( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR3]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied._Mmops() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +define i32 @caller3._Msve() #1 { +; CHECK-LABEL: define i32 @caller3._Msve( +; CHECK-SAME: ) 
local_unnamed_addr #[[ATTR1]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +define i32 @caller3.default() #0 { +; CHECK-LABEL: define i32 @caller3.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +declare i32 @test_non_fmv_caller._Maes() #6 +declare i32 @test_non_fmv_caller._Msm4() #7 +declare i32 @test_non_fmv_caller.default() #0 + +define weak_odr ptr @test_non_fmv_caller.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_non_fmv_caller.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 32768 + %.not = icmp eq i64 %1, 0 + %test_non_fmv_caller._Maes.test_non_fmv_caller.default = select i1 %.not, ptr @test_non_fmv_caller.default, ptr @test_non_fmv_caller._Maes + ret ptr %test_non_fmv_caller._Maes.test_non_fmv_caller.default +} + +define i32 @caller4() #8 { +; CHECK-LABEL: define i32 @caller4( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller._Maes() +; +entry: + %call = tail call i32 @test_non_fmv_caller() + ret i32 %call +} + +define i32 @caller5() #9 { +; CHECK-LABEL: define i32 @caller5( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR8:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller() +; +entry: + %call = tail call i32 @test_non_fmv_caller() + ret i32 %call +} + +declare i32 @test_priority._Msve2-sha3() #10 +declare i32 @test_priority._Mls64Mssbs() #11 +declare i32 @test_priority._MflagmMlseMrng() #12 +declare i32 @test_priority.default() #0 + +define weak_odr ptr @test_priority.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_priority.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 131 + %2 = icmp eq i64 %1, 131 + br i1 %2, label %common.ret, label %resolver_else + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_priority._MflagmMlseMrng, %resolver_entry ], [ @test_priority._Mls64Mssbs, %resolver_else ], [ %test_priority._Msve2-sha3.test_priority.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %3 = and i64 %0, 9570149208162304 + %4 = icmp eq i64 %3, 9570149208162304 + br i1 %4, label %common.ret, label %resolver_else2 + +resolver_else2: ; preds = %resolver_else + %5 = and i64 %0, 1099511627776 + %.not = icmp eq i64 %5, 0 + %test_priority._Msve2-sha3.test_priority.default = select i1 %.not, ptr @test_priority.default, ptr @test_priority._Msve2-sha3 + br label %common.ret +} + +define i32 @caller6._MflagmMls64MlseMrngMssbsMsve2-sha3() #13 { +; CHECK-LABEL: define i32 @caller6._MflagmMls64MlseMrngMssbsMsve2-sha3( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR12:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_priority._Mls64Mssbs() +; +entry: + %call = tail call i32 @test_priority() + ret i32 %call +} + +declare i32 @test_alternative_names._Mdpb2Mfrintts() #14 +declare i32 @test_alternative_names._Mflagm2Mfrintts() #15 +declare i32 @test_alternative_names._Mrcpc2() #16 +declare i32 @test_alternative_names.default() #0 + +define weak_odr ptr @test_alternative_names.resolver() 
comdat { +; CHECK-LABEL: define weak_odr ptr @test_alternative_names.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 17563904 + %2 = icmp eq i64 %1, 17563904 + br i1 %2, label %common.ret, label %resolver_else + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_alternative_names._Mdpb2Mfrintts, %resolver_entry ], [ @test_alternative_names._Mflagm2Mfrintts, %resolver_else ], [ %test_alternative_names._Mrcpc2.test_alternative_names.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %3 = and i64 %0, 16777478 + %4 = icmp eq i64 %3, 16777478 + br i1 %4, label %common.ret, label %resolver_else2 + +resolver_else2: ; preds = %resolver_else + %5 = and i64 %0, 12582912 + %6 = icmp eq i64 %5, 12582912 + %test_alternative_names._Mrcpc2.test_alternative_names.default = select i1 %6, ptr @test_alternative_names._Mrcpc2, ptr @test_alternative_names.default + br label %common.ret +} + +define i32 @caller7._Mdpb2Mfrintts() #14 { +; CHECK-LABEL: define i32 @caller7._Mdpb2Mfrintts( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR13:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mdpb2Mfrintts() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller7._Mfrintts() #17 { +; CHECK-LABEL: define i32 @caller7._Mfrintts( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR16:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller7._Mrcpc2() #16 { +; CHECK-LABEL: define i32 @caller7._Mrcpc2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR15:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mrcpc2() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller7.default() #0 { +; CHECK-LABEL: define i32 @caller7.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names.default() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +attributes #0 = { "fmv-features" } +attributes #1 = { "fmv-features"="sve" } +attributes #2 = { "fmv-features"="sve2" } +attributes #3 = { "fmv-features"="mops" } +attributes #4 = { "fmv-features"="mops,sve2" } +attributes #5 = { "fmv-features"="sme" } +attributes #6 = { "fmv-features"="aes" } +attributes #7 = { "fmv-features"="sm4" } +attributes #8 = { "target-features"="+aes,+fp-armv8,+neon,+outline-atomics,+v8a" } +attributes #9 = { "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,+sm4" } +attributes #10 = { "fmv-features"="sve2-sha3" } +attributes #11 = { "fmv-features"="ls64,ssbs" } +attributes #12 = { "fmv-features"="flagm,lse,rng" } +attributes #13 = { "fmv-features"="flagm,ls64,lse,rng,ssbs,sve2-sha3" } +attributes #14 = { "fmv-features"="dpb2,frintts" } +attributes #15 = { "fmv-features"="flagm2,frintts" } +attributes #16 = { "fmv-features"="rcpc2" } +attributes #17 = { "fmv-features"="frintts" } diff --git a/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll b/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll new file mode 100644 index 0000000000000..6296954333e8a --- /dev/null +++ b/llvm/test/Transforms/InstCombine/fsqrtdiv-transform.ll @@ -0,0 +1,631 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes='instcombine' < %s | FileCheck %s + +@x = global double 0.000000e+00 +@r1 = global double 0.000000e+00 +@r2 = global double 0.000000e+00 +@r3 = global double 0.000000e+00 +@v = global [2 x double] zeroinitializer +@v1 = global [2 x double] zeroinitializer +@v2 = global [2 x double] zeroinitializer + +; div/mul/div1 in the same block. +define void @bb_constraint_case1(double %a) { +; CHECK-LABEL: define void @bb_constraint_case1( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; div/mul in one block and div1 in other block with conditional guard. +define void @bb_constraint_case2(double %a, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case2( +; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.end, label %if.then + +if.then: ; preds = %entry + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; div in one block. mul/div1 in other block and conditionally guarded. Don't optimize. 
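+; (For intuition: with r = 1/sqrt(a) the fold uses r*r = 1/a and a/sqrt(a) =
+; sqrt(a), rebuilding r as (1/a)*sqrt(a), as the checks in case1/case2 above
+; show. Here the fmul no longer shares the fdiv's block, and the checks below
+; expect the original sequence to be kept.)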
+define void @bb_constraint_case3(double %a, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case3( +; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.end, label %if.then + +if.then: ; preds = %entry + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; div in one block. mul/div1 each in different block and conditionally guarded. Don't optimize. +define void @bb_constraint_case4(double %a, i32 %c, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case4( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: br i1 [[C_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_END1:%.*]], label [[IF_THEN1:%.*]] +; CHECK: if.then1: +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: br label [[IF_END1]] +; CHECK: if.end1: +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + br i1 %c.not, label %if.end, label %if.then + +if.then: ; preds = %entry + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + br label %if.end + +if.end: ; preds = %if.then, %entry + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.end1, label %if.then1 + +if.then1: ; preds = %if.end + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + br label %if.end1 + +if.end1: ; preds = %if.then1, %if.end + ret void +} + +; sqrt value comes from different blocks. Don't optimize. 
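+; (The %sqrt reaching the uses is a phi of two sqrt calls with different
+; arguments, %a and %a + 10, so there is no single operand from which to form
+; the 1/a reciprocal; the checks below expect the IR to be unchanged.)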
+define void @bb_constraint_case5(double %a, i32 %c) { +; CHECK-LABEL: define void @bb_constraint_case5( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: br i1 [[C_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP0:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[A]], 1.000000e+01 +; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[ADD]]) +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[SQRT:%.*]] = phi double [ [[TMP0]], [[IF_THEN]] ], [ [[TMP1]], [[IF_ELSE]] ] +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %c.not = icmp eq i32 %c, 0 + br i1 %c.not, label %if.else, label %if.then + +if.then: ; preds = %entry + %0 = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + br label %if.end + +if.else: ; preds = %entry + %add = fadd double %a, 1.000000e+01 + %1 = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %add) + br label %if.end + +if.end: ; preds = %if.else, %if.then + %sqrt = phi double[ %0, %if.then], [ %1, %if.else] + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; div in one block and conditionally guarded. mul/div1 in other block. Don't optimize. 
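+; (Here 1/sqrt(a) is only computed on the if.then path and %div is a phi whose
+; other incoming value is a plain load, so %mul cannot simply be rewritten to
+; 1/a; the checks below keep the original fdiv/fmul.)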
+define void @bb_constraint_case6(double %a, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case6( +; CHECK-SAME: double [[A:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @x, align 8 +; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP1:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[TMP1]], ptr @x, align 8 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[DIV:%.*]] = phi double [ [[TMP0]], [[IF_ELSE]] ], [ [[TMP1]], [[IF_THEN]] ] +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %d.not = icmp eq i32 %d, 0 + br i1 %d.not, label %if.else, label %if.then + +if.else: ; preds = %entry + %1 = load double, ptr @x + br label %if.end + +if.then: ; preds = %entry + %2 = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %2, ptr @x + br label %if.end + +if.end: ; preds = %if.else, %if.then + %div = phi double [ %1, %if.else ], [ %2, %if.then ] + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; value for mul comes from different blocks. Don't optimize. +define void @bb_constraint_case7(double %a, i32 %c, i32 %d) { +; CHECK-LABEL: define void @bb_constraint_case7( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]], i32 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: br i1 [[C_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[TMP0:%.*]] = fdiv double 3.000000e+00, [[A]] +; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK: if.else: +; CHECK-NEXT: [[D_NOT:%.*]] = icmp eq i32 [[D]], 0 +; CHECK-NEXT: br i1 [[D_NOT]], label [[IF_ELSE1:%.*]], label [[IF_THEN1:%.*]] +; CHECK: if.then1: +; CHECK-NEXT: [[TMP1:%.*]] = fdiv double 2.000000e+00, [[A]] +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.else1: +; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[MUL:%.*]] = phi double [ [[TMP1]], [[IF_THEN1]] ], [ [[TMP2]], [[IF_ELSE1]] ], [ [[TMP0]], [[IF_THEN]] ] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + br i1 %c.not, label %if.else, label %if.then + +if.then: ; preds = %entry + %1 = fdiv double 3.000000e+00, %a + br label %if.end + +if.else: ; preds = %entry + %d.not = icmp eq i32 %d, 0 + br 
i1 %d.not, label %if.else1, label %if.then1 + +if.then1: ; preds = %if.else + %2 = fdiv double 2.000000e+00, %a + br label %if.end + +if.else1: ; preds = %if.else + %3 = fmul reassoc double %div, %div + br label %if.end + +if.end: ; preds = %if.then1, %if.else1, %if.then + %mul = phi double [ %2, %if.then1 ], [ %3, %if.else1 ], [ %1, %if.then ] + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; value of mul comes from two different blocks(as shown by select ins). +define void @bb_constraint_case8(double %a, i32 %c) { +; CHECK-LABEL: define void @bb_constraint_case8( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = fmul double [[A]], [[A]] +; CHECK-NEXT: [[MUL:%.*]] = select i1 [[C_NOT]], double [[TMP1]], double [[TMP0]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + %1 = fmul double %a, %a + %2 = fmul reassoc double %div, %div + %mul = select i1 %c.not, double %1, double %2 + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; multiple instances of multiply ops to optimize. Optimize all. +define void @mutiple_multiply_instances(double %a, i32 %c) { +; CHECK-LABEL: define void @mutiple_multiply_instances( +; CHECK-SAME: double [[A:%.*]], i32 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP1:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP1]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[C_NOT:%.*]] = icmp eq i32 [[C]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = fmul double [[A]], [[A]] +; CHECK-NEXT: [[TMP3:%.*]] = fmul double [[A]], [[A]] +; CHECK-NEXT: [[MUL1:%.*]] = select i1 [[C_NOT]], double [[TMP2]], double [[TMP1]] +; CHECK-NEXT: [[MUL2:%.*]] = select i1 [[C_NOT]], double [[TMP1]], double [[TMP3]] +; CHECK-NEXT: store double [[MUL1]], ptr @r1, align 8 +; CHECK-NEXT: store double [[MUL2]], ptr @r3, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %c.not = icmp eq i32 %c, 0 + %1 = fmul double %a, %a + %2 = fmul double %a, %a + %3 = fmul reassoc double %div, %div + %4 = fmul reassoc double %div, %div + %mul1 = select i1 %c.not, double %1, double %3 + %mul2 = select i1 %c.not, double %4, double %2 + store double %mul1, ptr @r1 + store double %mul2, ptr @r3 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; missing flags for optimization. 
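+; (Below, the 1.0/sqrt(a) fdiv carries reassoc and ninf but not arcp, so
+; replacing the division with a multiply by a reciprocal is not permitted and
+; the checks expect the original instructions.)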
+define void @missing_arcp_flag_on_div(double %a) { +; CHECK-LABEL: define void @missing_arcp_flag_on_div( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; missing flags for optimization. +define void @missing_reassoc_flag_on_mul(double %a) { +; CHECK-LABEL: define void @missing_reassoc_flag_on_mul( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv reassoc double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; missing flags for optimization. 
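+; (Below it is the a/sqrt(a) division that lacks reassoc, so it cannot be
+; reassociated into sqrt(a) and the whole pattern is left as is.)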
+define void @missing_reassoc_flag_on_div1(double %a) { +; CHECK-LABEL: define void @missing_reassoc_flag_on_div1( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT:%.*]] = call reassoc nnan ninf nsz double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = fdiv reassoc ninf arcp double 1.000000e+00, [[SQRT]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul reassoc double [[DIV]], [[DIV]] +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV1:%.*]] = fdiv double [[A]], [[SQRT]] +; CHECK-NEXT: store double [[DIV1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; div = -1/sqrt(a) +define void @negative_fdiv_val(double %a) { +; CHECK-LABEL: define void @negative_fdiv_val( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = fneg reassoc double [[SQRT1]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[TMP1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double -1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +define void @fpmath_metadata_on_div1(double %a) { +; CHECK-LABEL: define void @fpmath_metadata_on_div1( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]), !fpmath [[META0:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt, !fpmath !3 + store double %div1, ptr @r2 + ret void +} + +define void @fpmath_metadata_on_mul(double %a) { +; CHECK-LABEL: define void @fpmath_metadata_on_mul( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]], !fpmath [[META1:![0-9]+]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv 
reassoc arcp ninf double 1.000000e+00, %sqrt + store double %div, ptr @x + %mul = fmul reassoc double %div, %div, !fpmath !2 + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +; FIXME: DIV in the result should get the fpmath metadata from %div. +define void @fpmath_metadata_on_div(double %a) { +; CHECK-LABEL: define void @fpmath_metadata_on_div( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]], !fpmath [[META2:![0-9]+]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a) + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt, !fpmath !1 + store double %div, ptr @x + %mul = fmul reassoc double %div, %div + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt + store double %div1, ptr @r2 + ret void +} + +define void @fpmath_metadata_on_all(double %a) { +; CHECK-LABEL: define void @fpmath_metadata_on_all( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc double @llvm.sqrt.f64(double [[A]]), !fpmath [[META0]] +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc double 1.000000e+00, [[A]], !fpmath [[META1]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc double [[TMP0]], [[SQRT1]], !fpmath [[META2]] +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: store double [[TMP0]], ptr @r1, align 8 +; CHECK-NEXT: store double [[SQRT1]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf double @llvm.sqrt.f64(double %a), !fpmath !0 + %div = fdiv reassoc arcp ninf double 1.000000e+00, %sqrt, !fpmath !1 + store double %div, ptr @x + %mul = fmul reassoc double %div, %div, !fpmath !2 + store double %mul, ptr @r1 + %div1 = fdiv reassoc double %a, %sqrt, !fpmath !3 + store double %div1, ptr @r2 + ret void +} + +define void @vector_input(<2 x double> %a) { +; CHECK-LABEL: define void @vector_input( +; CHECK-SAME: <2 x double> [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SQRT1:%.*]] = call reassoc <2 x double> @llvm.sqrt.v2f64(<2 x double> [[A]]) +; CHECK-NEXT: [[TMP0:%.*]] = fdiv reassoc <2 x double> splat (double 1.000000e+00), [[A]] +; CHECK-NEXT: [[DIV:%.*]] = fmul reassoc <2 x double> [[TMP0]], [[SQRT1]] +; CHECK-NEXT: store <2 x double> [[DIV]], ptr @v, align 16 +; CHECK-NEXT: store <2 x double> [[TMP0]], ptr @v1, align 16 +; CHECK-NEXT: store <2 x double> [[SQRT1]], ptr @v2, align 16 +; CHECK-NEXT: ret void +; +entry: + %sqrt = call reassoc nnan nsz ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) + %div = fdiv reassoc arcp ninf <2 x double>, %sqrt + store <2 x double> %div, ptr @v + %mul = fmul reassoc <2 x double> %div, %div + store <2 x double> %mul, ptr @v1 + %div1 = fdiv reassoc <2 x double> %a, %sqrt + store <2 x double> %div1, ptr @v2 + ret void +} + +define void @strict_fp_metadata(double %a) { +; CHECK-LABEL: define void @strict_fp_metadata( +; CHECK-SAME: double [[A:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 1, metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: [[CALL:%.*]] = call 
double @llvm.sqrt.f64(double noundef [[A]]) +; CHECK-NEXT: [[DIV:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[CONV]], double [[CALL]], metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: store double [[DIV]], ptr @x, align 8 +; CHECK-NEXT: [[MUL:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[DIV]], double [[DIV]], metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: store double [[MUL]], ptr @r1, align 8 +; CHECK-NEXT: [[DIV2:%.*]] = call double @llvm.experimental.constrained.fdiv.f64(double [[A]], double [[CALL]], metadata !"round.dynamic", metadata !"fpexcept.strict") +; CHECK-NEXT: store double [[DIV2]], ptr @r2, align 8 +; CHECK-NEXT: ret void +; +entry: + %conv = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 1, metadata !"round.dynamic", metadata !"fpexcept.strict") + %call = call double @llvm.sqrt.f64(double noundef %a) + %div = call double @llvm.experimental.constrained.fdiv.f64(double %conv, double %call, metadata !"round.dynamic", metadata !"fpexcept.strict") + store double %div, ptr @x + %mul = call double @llvm.experimental.constrained.fmul.f64(double %div, double %div, metadata !"round.dynamic", metadata !"fpexcept.strict") + store double %mul, ptr @r1 + %div2 = call double @llvm.experimental.constrained.fdiv.f64(double %a, double %call, metadata !"round.dynamic", metadata !"fpexcept.strict") + store double %div2, ptr @r2 + ret void +} + +declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, metadata) +declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) +declare double @llvm.sqrt.f64(double) +declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) + +!0 = !{float 2.5} +!1 = !{float 3.5} +!2 = !{float 4.5} +!3 = !{float 5.5} +; CHECK: [[META0]] = !{float 5.500000e+00} +; CHECK: [[META1]] = !{float 4.500000e+00} +; CHECK: [[META2]] = !{float 3.500000e+00} diff --git a/llvm/test/Transforms/InstCombine/nsw.ll b/llvm/test/Transforms/InstCombine/nsw.ll index 329a47324f862..b00f2e58add78 100644 --- a/llvm/test/Transforms/InstCombine/nsw.ll +++ b/llvm/test/Transforms/InstCombine/nsw.ll @@ -415,3 +415,63 @@ define i8 @neg_nsw_mul_missing_nsw_on_mul(i8 %a1, i8 %a2, i8 %b) { %neg = sub nsw i8 0, %shl ret i8 %neg } + +; This could propagate nsw. + +define i16 @mul_nsw_reassoc_prop(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop( +; CHECK-NEXT: [[B:%.*]] = mul nsw i16 [[X:%.*]], 6 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, 3 + %b = mul nsw i16 %a, 2 + ret i16 %b +} + +; This could propagate nsw. + +define i16 @mul_nsw_reassoc_prop_neg(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_neg( +; CHECK-NEXT: [[B:%.*]] = mul nsw i16 [[X:%.*]], -2201 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, -71 + %b = mul nsw i16 %a, 31 + ret i16 %b +} + +; Must not propagate nsw. + +define i16 @mul_nsw_reassoc_prop_no_nsw1(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_no_nsw1( +; CHECK-NEXT: [[B:%.*]] = mul i16 [[X:%.*]], 6 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul i16 %x, 3 + %b = mul nsw i16 %a, 2 + ret i16 %b +} + +; Must not propagate nsw. + +define i16 @mul_nsw_reassoc_prop_no_nsw2(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_no_nsw2( +; CHECK-NEXT: [[B:%.*]] = mul i16 [[X:%.*]], 6 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, 3 + %b = mul i16 %a, 2 + ret i16 %b +} + +; Must not propagate nsw. 
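+; (The folded constant itself wraps in i16: 1023 * 33 = 33759, which exceeds
+; the signed i16 maximum of 32767 and wraps to -31777, matching the constant in
+; the check below, so nsw cannot be carried over to the combined multiply.)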
+ +define i16 @mul_nsw_reassoc_prop_overflow(i16 %x) { +; CHECK-LABEL: @mul_nsw_reassoc_prop_overflow( +; CHECK-NEXT: [[B:%.*]] = mul i16 [[X:%.*]], -31777 +; CHECK-NEXT: ret i16 [[B]] +; + %a = mul nsw i16 %x, 1023 + %b = mul nsw i16 %a, 33 + ret i16 %b +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index bf95622733461..05c0bc0761ea4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -396,8 +396,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP10]]) ; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP10]]) ; CHECK-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] ; CHECK-NEXT: [[TMP13:%.*]] = sub nsw [[REVERSE1]], [[VEC_IND]] @@ -1548,5 +1548,263 @@ end: ret void } +; Check vectorization on an interleaved load/store groups of factor 4 + +; for (int i = 0; i < 1024; ++i) { +; dst[i].x = a[i].x + b[i].x; +; dst[i].y = a[i].y - b[i].y; +; dst[i].z = a[i].z << b[i].z; +; dst[i].t = a[i].t >> b[i].t; +; } +%struct.xyzt = type { i32, i32, i32, i32 } + +define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a, ptr readonly %b) { +; CHECK-LABEL: @interleave_deinterleave( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], 1024 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP7]]) +; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC7]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC7]], 1 +; CHECK-NEXT: 
[[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load , ptr [[TMP13]], align 4 +; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC8]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC9]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC9]], 1 +; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP14]]) +; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP15]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC10]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC11]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC10]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC11]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP16]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP11]], [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = ashr [[TMP12]], [[TMP19]] +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP20]], [[TMP23]]) +; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP22]], [[TMP24]]) +; CHECK-NEXT: [[INTERLEAVED_VEC13:%.*]] = call @llvm.vector.interleave2.nxv16i32( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC13]], ptr [[TMP21]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 +; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[Z]], align 4 +; 
CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 +; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP32]], [[TMP33]] +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 +; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv + store i32 %add, ptr %arrayidx5, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4 + %2 = load i32, ptr %y, align 4 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4 + %3 = load i32, ptr %y11, align 4 + %sub = sub nsw i32 %2, %3 + %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4 + store i32 %sub, ptr %y14, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8 + %4 = load i32, ptr %z, align 4 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8 + %5 = load i32, ptr %z19, align 4 + %shl = shl i32 %4, %5 + %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8 + store i32 %shl, ptr %z22, align 4 + %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12 + %6 = load i32, ptr %t, align 4 + %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12 + %7 = load i32, ptr %t27, align 4 + %shr = ashr i32 %6, %7 + %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12 + store i32 %shr, ptr %t30, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +; Check vectorization on a reverse interleaved load/store groups of factor 4 + +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; +; int b = A[i].y - i; +; int c = A[i].z * i; +; int d = A[i].t << i; +; B[i].x = a; +; B[i].y = b; +; B[i].z = c; +; B[i].t = d; +; } + +define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) #1{ +; CHECK-LABEL: @interleave_deinterleave_reverse( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[INDUCTION:%.*]] = sub splat (i32 
1023), [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub nsw i32 0, [[TMP3]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP10]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP11]]) +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP13]]) +; CHECK-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP14]]) +; CHECK-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP15]]) +; CHECK-NEXT: [[REVERSE5:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP16]]) +; CHECK-NEXT: [[TMP17:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] +; CHECK-NEXT: [[TMP18:%.*]] = sub nsw [[REVERSE3]], [[VEC_IND]] +; CHECK-NEXT: [[TMP19:%.*]] = mul nsw [[REVERSE4]], [[VEC_IND]] +; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw [[REVERSE5]], [[VEC_IND]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4 +; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]] +; CHECK-NEXT: [[REVERSE6:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP17]]) +; CHECK-NEXT: [[REVERSE7:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP18]]) +; CHECK-NEXT: [[REVERSE8:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP19]]) +; CHECK-NEXT: [[REVERSE9:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP20]]) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE6]], [[REVERSE8]]) +; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE7]], [[REVERSE9]]) +; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv16i32( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC10]]) +; CHECK-NEXT: store 
[[INTERLEAVED_VEC11]], ptr [[TMP26]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP44:![0-9]+]] +; +entry: + br label %for.body +for.cond.cleanup: ; preds = %for.body + ret void +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 0 + %load1 = load i32, ptr %x, align 4 + %trunc = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %load1, %trunc + %y = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 1 + %load2 = load i32, ptr %y, align 4 + %sub = sub nsw i32 %load2, %trunc + %z = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 2 + %load3 = load i32, ptr %z, align 4 + %mul = mul nsw i32 %load3, %trunc + %t = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 3 + %load4 = load i32, ptr %t, align 4 + %shl = shl nuw nsw i32 %load4, %trunc + %x5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 0 + store i32 %add, ptr %x5, align 4 + %y8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 1 + store i32 %sub, ptr %y8, align 4 + %z5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 2 + store i32 %mul, ptr %z5, align 4 + %t8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 3 + store i32 %shl, ptr %t8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +} attributes #1 = { "target-features"="+sve" vscale_range(1, 16) } attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 1a281fe7c6f7f..d4392bebdf37b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -529,3 +529,255 @@ for.inc: for.end: ret void } + +; Expected to contain interleave2/deinterleave2 instructions +; +; void masked_strided_factor4(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char left1 = p[4*ix]; +; char right1 = p[4*ix + 1]; +; char left2 = p[4*ix + 2]; +; char right2 = p[4*ix + 3]; +; char max1 = max(left1, right1); +; char max2 = max(left2, right2); +; q[4*ix] = max1; +; q[4*ix + 1] = 0 - max1; +; q[4*ix + 2] = max2; +; q[4*ix + 3] = 0 - max2; +; } +; } +;} +define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 { +; SCALAR_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4 +; SCALAR_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias 
nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SCALAR_TAIL_FOLDING-NEXT: entry: +; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALAR_TAIL_FOLDING: vector.ph: +; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] +; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALAR_TAIL_FOLDING: vector.body: +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]], [[INTERLEAVED_MASK1]]) +; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK2]], poison) +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]]) +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) 
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP17]], [[TMP19]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP18]], [[TMP20]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC5]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]], [[INTERLEAVED_MASK8]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK9]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALAR_TAIL_FOLDING: middle.block: +; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; SCALAR_TAIL_FOLDING: scalar.ph: +; SCALAR_TAIL_FOLDING-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] +; SCALAR_TAIL_FOLDING: for.body: +; SCALAR_TAIL_FOLDING-NEXT: [[IX_024:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_024]], [[CONV]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; SCALAR_TAIL_FOLDING: if.then: +; SCALAR_TAIL_FOLDING-NEXT: [[IDX0:%.*]] = shl nuw nsw i32 [[IX_024]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX1:%.*]] = or disjoint i32 [[IDX0]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX2:%.*]] = or disjoint i32 [[IDX0]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX3:%.*]] = or disjoint i32 [[IDX0]], 3 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP24:%.*]] = zext nneg i32 [[IDX0]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP24]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAY1IDX0]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP26:%.*]] = zext nneg i32 [[IDX1]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP26]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAY1IDX1]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP28:%.*]] = zext nneg i32 [[IDX2]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP28]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAY1IDX2]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP30:%.*]] = zext nneg 
i32 [[IDX3]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP30]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAY1IDX3]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I1:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP25]], i8 [[TMP27]]) +; SCALAR_TAIL_FOLDING-NEXT: [[SUB1:%.*]] = sub i8 0, [[SPEC_SELECT_I1]] +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I2:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP29]], i8 [[TMP31]]) +; SCALAR_TAIL_FOLDING-NEXT: [[SUB2:%.*]] = sub i8 0, [[SPEC_SELECT_I2]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP32:%.*]] = zext nneg i32 [[IDX0]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP32]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I1]], ptr [[ARRAY3IDX0]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP33:%.*]] = zext nneg i32 [[IDX1]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP33]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB1]], ptr [[ARRAY3IDX1]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP34:%.*]] = zext nneg i32 [[IDX2]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP34]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I2]], ptr [[ARRAY3IDX2]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP35:%.*]] = zext nneg i32 [[IDX3]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP35]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB2]], ptr [[ARRAY3IDX3]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]] +; SCALAR_TAIL_FOLDING: for.inc: +; SCALAR_TAIL_FOLDING-NEXT: [[INC]] = add nuw nsw i32 [[IX_024]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALAR_TAIL_FOLDING: for.end: +; SCALAR_TAIL_FOLDING-NEXT: ret void +; +; PREDICATED_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4 +; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { +; PREDICATED_TAIL_FOLDING-NEXT: entry: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; PREDICATED_TAIL_FOLDING: vector.ph: +; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, 
zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDICATED_TAIL_FOLDING: vector.body: +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]], [[INTERLEAVED_MASK1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK2]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP17]], [[TMP19]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP18]], [[TMP20]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC5]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: 
[[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]], [[INTERLEAVED_MASK8]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK9]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; PREDICATED_TAIL_FOLDING: middle.block: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; PREDICATED_TAIL_FOLDING: scalar.ph: +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] +; PREDICATED_TAIL_FOLDING: for.body: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; PREDICATED_TAIL_FOLDING: if.then: +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_INC]] +; PREDICATED_TAIL_FOLDING: for.inc: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; PREDICATED_TAIL_FOLDING: for.end: +; PREDICATED_TAIL_FOLDING-NEXT: ret void +; +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.024, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %idx0 = shl nuw nsw i32 %ix.024, 2 + %idx1 = add i32 %idx0, 1 + %idx2 = add i32 %idx0, 2 + %idx3 = add i32 %idx0, 3 + + %array1idx0 = getelementptr inbounds i8, ptr %p, i32 %idx0 + %0 = load i8, ptr %array1idx0, align 1 + %array1idx1 = getelementptr inbounds i8, ptr %p, i32 %idx1 + %1 = load i8, ptr %array1idx1, align 1 + %array1idx2 = getelementptr inbounds i8, ptr %p, i32 %idx2 + %2 = load i8, ptr %array1idx2, align 1 + %array1idx3 = getelementptr inbounds i8, ptr %p, i32 %idx3 + %3 = load i8, ptr %array1idx3, align 1 + + %cmp.i1 = icmp slt i8 %0, %1 + %spec.select.i1 = select i1 %cmp.i1, i8 %1, i8 %0 + %sub1 = sub i8 0, %spec.select.i1 + %cmp.i2 = icmp slt i8 %2, %3 + %spec.select.i2 = select i1 %cmp.i2, i8 %3, i8 %2 + %sub2 = sub i8 0, %spec.select.i2 + + %array3idx0 = getelementptr inbounds i8, ptr %q, i32 %idx0 + store i8 %spec.select.i1, ptr %array3idx0, align 1 + %array3idx1 = getelementptr inbounds i8, ptr %q, i32 %idx1 + store i8 %sub1, ptr %array3idx1, align 1 + %array3idx2 = getelementptr inbounds i8, ptr %q, i32 %idx2 + store i8 %spec.select.i2, ptr %array3idx2, align 1 + %array3idx3 = getelementptr inbounds i8, ptr %q, i32 %idx3 + store i8 %sub2, ptr %array3idx3, align 1 + + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.024, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll index bda4839dead51..b1ff589fe51bf 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -9,7 +9,7 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul 
i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -17,88 +17,88 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 1) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i32 2) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP12]], [[TMP15]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 1) +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 2) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP11]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; CHECK-NEXT: 
[[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; CHECK-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor2_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP2]], align 4 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> -; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; FIXED-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> +; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; 
FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; FIXED-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -107,7 +107,7 @@ define void @load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -115,44 +115,44 @@ define void @load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr 
i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 1) -; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i32 2) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP12]], [[TMP15]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 1) +; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 2) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP11]], [[TMP12]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; SCALABLE-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], 
!llvm.loop [[LOOP3:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -186,7 +186,7 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -194,88 +194,88 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 1) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i64 2) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP12]], [[TMP15]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 1) +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 2) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP11]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], 
[[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor2_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> -; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> 
+; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> +; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; FIXED-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -284,7 +284,7 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -292,44 +292,44 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; 
SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 1) -; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i64 2) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP12]], [[TMP15]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 1) +; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 2) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP11]], [[TMP12]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; SCALABLE-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label 
[[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -360,42 +360,42 @@ exit: define void @load_store_factor3_i32(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: 
middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; CHECK-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -405,50 +405,50 @@ define void @load_store_factor3_i32(ptr %p) { ; CHECK-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; CHECK-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; CHECK-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor3_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; FIXED-NEXT: [[TMP9:%.*]] 
= add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; FIXED-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; FIXED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; FIXED-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -458,50 +458,50 @@ define void @load_store_factor3_i32(ptr %p) { ; FIXED-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; FIXED-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; FIXED-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq 
i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor3_i32( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; SCALABLE-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALABLE-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; SCALABLE-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: 
scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -511,9 +511,9 @@ define void @load_store_factor3_i32(ptr %p) { ; SCALABLE-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; SCALABLE-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; SCALABLE-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -550,42 +550,42 @@ exit: define void @load_store_factor3_i64(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i64( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; CHECK-NEXT: 
[[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; CHECK-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -595,50 +595,50 @@ define void @load_store_factor3_i64(ptr %p) { ; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; 
CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor3_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; FIXED-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; FIXED-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, 
[[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; FIXED-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -648,50 +648,50 @@ define void @load_store_factor3_i64(ptr %p) { ; FIXED-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; FIXED-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; FIXED-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor3_i64( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; SCALABLE-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; SCALABLE-NEXT: [[TMP11:%.*]] = 
shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALABLE-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; SCALABLE-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -701,9 +701,9 @@ define void @load_store_factor3_i64(ptr %p) { ; SCALABLE-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; SCALABLE-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; SCALABLE-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] 
= add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -740,56 +740,75 @@ exit: define void @load_store_factor8(ptr %p) { ; CHECK-LABEL: @load_store_factor8( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; CHECK-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> 
[[TMP27]], <16 x i64> poison, <16 x i32> -; CHECK-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP6]]) +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP7]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP8]]) +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP9]]) +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP10]]) +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 1) +; CHECK-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 2) +; CHECK-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 3) +; CHECK-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 4) +; CHECK-NEXT: [[TMP24:%.*]] = add [[TMP16]], splat (i64 5) +; CHECK-NEXT: [[TMP25:%.*]] = add [[TMP17]], splat (i64 6) +; CHECK-NEXT: [[TMP26:%.*]] = add [[TMP18]], splat (i64 7) +; CHECK-NEXT: [[TMP27:%.*]] = add [[TMP19]], splat (i64 8) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP20]], [[TMP24]]) +; CHECK-NEXT: [[INTERLEAVED_VEC7:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP21]], [[TMP25]]) +; CHECK-NEXT: [[INTERLEAVED_VEC8:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP22]], [[TMP26]]) +; CHECK-NEXT: [[INTERLEAVED_VEC9:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP23]], [[TMP27]]) +; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC8]]) +; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]], [[INTERLEAVED_VEC9]]) +; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i64( 
[[INTERLEAVED_VEC10]], [[INTERLEAVED_VEC11]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP2]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; CHECK-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -824,23 +843,23 @@ define void @load_store_factor8(ptr %p) { ; CHECK-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; CHECK-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; CHECK-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor8( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: 
[[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> @@ -849,39 +868,39 @@ define void @load_store_factor8(ptr %p) { ; FIXED-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; FIXED-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; FIXED-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; FIXED-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; FIXED-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; FIXED-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; FIXED-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; FIXED-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; FIXED-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; FIXED-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; FIXED-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; FIXED-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; FIXED-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; FIXED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) +; FIXED-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) +; FIXED-NEXT: [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) +; FIXED-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) +; FIXED-NEXT: [[TMP9:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) +; FIXED-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) +; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> +; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> [[TMP8]], <4 x i32> +; FIXED-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <4 x i32> +; FIXED-NEXT: [[TMP15:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP12]], <8 x i32> +; FIXED-NEXT: [[TMP16:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> 
[[TMP14]], <8 x i32> +; FIXED-NEXT: [[TMP17:%.*]] = shufflevector <8 x i64> [[TMP15]], <8 x i64> [[TMP16]], <16 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP17]], <16 x i64> poison, <16 x i32> +; FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 2 +; FIXED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; FIXED-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -916,64 +935,83 @@ define void @load_store_factor8(ptr %p) { ; FIXED-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; FIXED-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; FIXED-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor8( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = 
getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; SCALABLE-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; SCALABLE-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; SCALABLE-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; SCALABLE-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; SCALABLE-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; SCALABLE-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; SCALABLE-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; SCALABLE-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; SCALABLE-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; SCALABLE-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; SCALABLE-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; SCALABLE-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; SCALABLE-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; SCALABLE-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP3:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]]) +; SCALABLE-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP6]]) +; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP7]]) +; SCALABLE-NEXT: 
[[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; SCALABLE-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP8]]) +; SCALABLE-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP9]]) +; SCALABLE-NEXT: [[STRIDED_VEC5:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP10]]) +; SCALABLE-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP11]]) +; SCALABLE-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; SCALABLE-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; SCALABLE-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 0 +; SCALABLE-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; SCALABLE-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; SCALABLE-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; SCALABLE-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 1 +; SCALABLE-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; SCALABLE-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 1) +; SCALABLE-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 2) +; SCALABLE-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 3) +; SCALABLE-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 4) +; SCALABLE-NEXT: [[TMP24:%.*]] = add [[TMP16]], splat (i64 5) +; SCALABLE-NEXT: [[TMP25:%.*]] = add [[TMP17]], splat (i64 6) +; SCALABLE-NEXT: [[TMP26:%.*]] = add [[TMP18]], splat (i64 7) +; SCALABLE-NEXT: [[TMP27:%.*]] = add [[TMP19]], splat (i64 8) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP20]], [[TMP24]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC7:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP21]], [[TMP25]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC8:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP22]], [[TMP26]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC9:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP23]], [[TMP27]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC8]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]], [[INTERLEAVED_VEC9]]) +; SCALABLE-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i64( [[INTERLEAVED_VEC10]], [[INTERLEAVED_VEC11]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP2]] +; SCALABLE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: 
[[OFFSET0:%.*]] = shl i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -1008,9 +1046,9 @@ define void @load_store_factor8(ptr %p) { ; SCALABLE-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; SCALABLE-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; SCALABLE-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -1080,7 +1118,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -1088,94 +1126,94 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr 
i32, ptr [[Q:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 -; CHECK-NEXT: store [[TMP12]], ptr [[TMP14]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; CHECK-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; CHECK-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; CHECK-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @combine_load_factor2_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP2:%.*]] = shl 
i64 [[TMP0]], 1 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[I]], 8 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 ; FIXED-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]] +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP3]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4 +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4 -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP8:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; FIXED-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]] -; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] -; FIXED-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0 -; FIXED-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 8 -; FIXED-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4 -; FIXED-NEXT: store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4 +; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> +; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC4]] +; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] +; FIXED-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 +; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 8 +; FIXED-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP9]], align 4 +; FIXED-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP10]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 16 +; FIXED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 
-; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; FIXED-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; FIXED-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; FIXED-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -1184,7 +1222,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -1192,43 +1230,43 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 -; SCALABLE-NEXT: store [[TMP12]], ptr [[TMP14]], align 4 -; SCALABLE-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; SCALABLE-NEXT: store [[TMP11]], ptr [[TMP13]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]] -; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]] +; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]] ; SCALABLE-NEXT: store i32 [[RES]], ptr [[DST]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -1263,7 +1301,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -1271,94 +1309,94 @@ define void 
@combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0 -; CHECK-NEXT: store [[TMP12]], ptr [[TMP14]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0 +; CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], 
i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; CHECK-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @combine_load_factor2_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[TMP1:%.*]] = add i64 [[I]], 4 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 ; FIXED-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 -; FIXED-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]] +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8 +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8 -; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]] -; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]] -; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]] -; FIXED-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0 -; FIXED-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP10]], i32 4 -; FIXED-NEXT: store <4 x i64> [[TMP8]], ptr [[TMP12]], align 8 -; FIXED-NEXT: store <4 x i64> [[TMP9]], ptr [[TMP13]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[WIDE_VEC2:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8 +; FIXED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x 
i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> +; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC1]] +; FIXED-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC3]], [[STRIDED_VEC4]] +; FIXED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]] +; FIXED-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0 +; FIXED-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[TMP8]], i32 4 +; FIXED-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP9]], align 8 +; FIXED-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP10]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]] -; FIXED-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; FIXED-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; FIXED-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -1367,7 +1405,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -1375,43 +1413,43 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] 
+; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], [[TMP11]] -; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] -; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0 -; SCALABLE-NEXT: store [[TMP12]], ptr [[TMP14]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[TMP10]] +; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0 +; SCALABLE-NEXT: store [[TMP11]], ptr [[TMP13]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[RES:%.*]] = add i64 
[[X0]], [[X1]] -; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]] +; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]] ; SCALABLE-NEXT: store i64 [[RES]], ptr [[DST]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll index 9f8cf169c0593..809b69900731a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll @@ -11,6 +11,10 @@ ; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \ ; RUN: -mtriple=riscv64 -mattr=+v,+f -S < %s| FileCheck %s --check-prefix=NO-VP +; FIXME: Fixed-order recurrence is not supported yet with EVL tail folding. +; The llvm.splice may cause unexpected behavior if the evl of the +; second-to-last iteration is not VF*UF. + define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-LABEL: define void @first_order_recurrence( ; IF-EVL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[TC:%.*]]) #[[ATTR0:[0-9]+]] { @@ -27,31 +31,35 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() ; IF-EVL-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4 ; IF-EVL-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1 ; IF-EVL-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i32 33, i32 [[TMP11]] +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] ; IF-EVL: [[VECTOR_BODY]]: -; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = 
shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP25:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP26:%.*]] = add zeroinitializer, [[TMP25]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP26]] +; IF-EVL-NEXT: [[TMP27:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP14]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], splat (i1 true), i32 [[TMP12]]) +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP15]], i32 4, [[TMP27]], poison) ; IF-EVL-NEXT: [[TMP16:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) -; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP16]], [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP12]]) +; IF-EVL-NEXT: [[TMP19:%.*]] = add nsw [[TMP16]], [[VP_OP_LOAD]] ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP13]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0 -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP18]], splat (i1 true), i32 [[TMP12]]) -; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP19]], ptr [[TMP18]], i32 4, [[TMP27]]) +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] ; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: @@ -172,6 +180,7 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() @@ -182,27 +191,30 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 4 ; IF-EVL-NEXT: [[TMP14:%.*]] = sub i32 [[TMP13]], 1 ; IF-EVL-NEXT: [[VECTOR_RECUR_INIT1:%.*]] = insertelement poison, i32 22, i32 [[TMP14]] +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] ; IF-EVL: [[VECTOR_BODY]]: -; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP19:%.*]], 
%[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP16:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP32:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP33:%.*]] = add zeroinitializer, [[TMP32]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP33]] +; IF-EVL-NEXT: [[TMP34:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT4]] ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP16]] ; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], splat (i1 true), i32 [[TMP15]]) +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 4, [[TMP34]], poison) ; IF-EVL-NEXT: [[TMP19]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) ; IF-EVL-NEXT: [[TMP20:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP19]], i32 -1) -; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP19]], [[TMP20]], splat (i1 true), i32 [[TMP15]]) +; IF-EVL-NEXT: [[TMP23:%.*]] = add nsw [[TMP19]], [[TMP20]] ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP16]] ; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP21]], i32 0 -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP22]], splat (i1 true), i32 [[TMP15]]) -; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP15]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP23]], ptr [[TMP22]], i32 4, [[TMP34]]) +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] ; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: @@ -218,12 +230,12 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL: [[SCALAR_PH]]: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT6:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] ; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] ; IF-EVL: [[FOR_BODY]]: ; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP31:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT6]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], 
i64 [[INDVARS]] ; IF-EVL-NEXT: [[TMP31]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[FOR2]] @@ -342,6 +354,7 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[TC]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TC]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; IF-EVL-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32() @@ -356,30 +369,33 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 4 ; IF-EVL-NEXT: [[TMP17:%.*]] = sub i32 [[TMP16]], 1 ; IF-EVL-NEXT: [[VECTOR_RECUR_INIT3:%.*]] = insertelement poison, i32 11, i32 [[TMP17]] +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer ; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]] ; IF-EVL: [[VECTOR_BODY]]: -; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[VP_OP_LOAD:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR2:%.*]] = phi [ [[VECTOR_RECUR_INIT1]], %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[VECTOR_RECUR4:%.*]] = phi [ [[VECTOR_RECUR_INIT3]], %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] -; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[TC]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP19:%.*]] = add i64 [[EVL_BASED_IV]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; IF-EVL-NEXT: [[TMP39:%.*]] = call @llvm.stepvector.nxv4i64() +; IF-EVL-NEXT: [[TMP40:%.*]] = add zeroinitializer, [[TMP39]] +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP40]] +; IF-EVL-NEXT: [[TMP41:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT6]] ; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP19]] ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP20]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP21]], splat (i1 true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[VP_OP_LOAD]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP21]], i32 4, [[TMP41]], poison) ; IF-EVL-NEXT: [[TMP22]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR]], [[VP_OP_LOAD]], i32 -1) ; IF-EVL-NEXT: [[TMP23]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR2]], [[TMP22]], i32 -1) ; IF-EVL-NEXT: [[TMP24:%.*]] = call @llvm.vector.splice.nxv4i32( [[VECTOR_RECUR4]], [[TMP23]], i32 -1) -; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.add.nxv4i32( [[TMP23]], [[TMP24]], splat (i1 true), i32 [[TMP18]]) -; IF-EVL-NEXT: [[VP_OP5:%.*]] = call @llvm.vp.add.nxv4i32( [[VP_OP]], [[TMP22]], splat (i1 
true), i32 [[TMP18]]) +; IF-EVL-NEXT: [[TMP27:%.*]] = add nsw [[TMP23]], [[TMP24]] +; IF-EVL-NEXT: [[TMP42:%.*]] = add [[TMP27]], [[TMP22]] ; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[TMP19]] ; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP25]], i32 0 -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP5]], ptr align 4 [[TMP26]], splat (i1 true), i32 [[TMP18]]) -; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP18]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP27]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP42]], ptr [[TMP26]], i32 4, [[TMP41]]) +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]] ; IF-EVL-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: @@ -399,14 +415,14 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL: [[SCALAR_PH]]: ; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT9:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT7]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT9:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT10:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT7]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ] ; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] ; IF-EVL: [[FOR_BODY]]: ; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP38:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT9]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT9]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT10]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] ; IF-EVL-NEXT: [[TMP38]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR2]], [[FOR3]] diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll index 572511a5ffb92..90671689f1dce 100644 --- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll +++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -p loop-vectorize -force-vector-width=2 -use-dereferenceable-at-point-semantics=1 -S %s | FileCheck %s +; RUN: opt -p loop-vectorize -force-vector-width=2 -use-dereferenceable-at-point-semantics -S %s | 
FileCheck %s declare void @llvm.assume(i1) -define void @deref_assumption_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -104,7 +104,7 @@ exit: ret void } -define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -181,7 +181,7 @@ exit: ret void } -define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_too_small_in_header_constant_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -282,7 +282,7 @@ exit: ret void } -define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_1( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -383,7 +383,7 @@ exit: ret void } -define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attribute(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attribute(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attribute( ; CHECK-SAME: ptr noalias align 4 [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -484,7 +484,7 @@ exit: ret void } -define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_not_known( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -585,7 +585,7 @@ exit: ret void } -define void @deref_assumption_in_then_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_then_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_then_constant_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr 
noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -682,7 +682,7 @@ exit: ret void } -define void @deref_assumption_in_latch_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_latch_constant_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_latch_constant_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -785,7 +785,7 @@ exit: ret void } -define void @deref_assumption_in_header_variable_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) nofree { +define void @deref_assumption_in_header_variable_trip_count(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_variable_trip_count( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -890,7 +890,7 @@ exit: ret void } -define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_1( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -968,7 +968,7 @@ exit: ret void } -define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1063,7 +1063,7 @@ exit: ret void } -define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1142,7 +1142,7 @@ exit: } -define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr( ; CHECK-SAME: ptr noalias align 4 [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1220,7 +1220,7 @@ exit: ret void } -define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known(ptr noalias %a, ptr 
noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1315,7 +1315,7 @@ exit: ret void } -define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree { +define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4(ptr noalias %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*]]: @@ -1410,8 +1410,8 @@ exit: ret void } -; %a may be freeed between the dereferenceable assumption and accesses. -; FIXME: It is not safe to use with -use-dereferenceable-at-point-semantics. +; %a may be freed between the dereferenceable assumption and accesses. +; It is not safe to use with -use-dereferenceable-at-point-semantics. define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; CHECK-LABEL: define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { @@ -1422,16 +1422,29 @@ define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_ ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: ; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP15]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP15]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP14]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x i32> [ [[TMP12]], 
%[[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], %[[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 @@ -1491,6 +1504,103 @@ exit: ret void } +; %a may be freed between the dereferenceable assumption and accesses. +; It is not safe to use with -use-dereferenceable-at-point-semantics. +define void @may_free_local_ptr_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %b, ptr noalias %c) nofree nosync { +; CHECK-LABEL: define void @may_free_local_ptr_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr( +; CHECK-SAME: ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[A:%.*]] = call ptr @get_ptr() +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4) ] +; CHECK-NEXT: call void @may_free() +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp sge <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP10]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ [[TMP8]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], %[[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP14]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; 
CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 +; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] +; CHECK: [[LOOP_THEN]]: +; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP35:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %a = call ptr @get_ptr() + call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4) ] + call void @may_free() + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv + %l.b = load i32, ptr %gep.b, align 4 + %c.1 = icmp sge i32 %l.b, 0 + br i1 %c.1, label %loop.latch, label %loop.then + +loop.then: + %l.a = load i32, ptr %a, align 4 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ] + %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv + store i32 %merge, ptr %gep.c, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +declare ptr @get_ptr() declare void @may_free() ;. @@ -1528,4 +1638,6 @@ declare void @may_free() ; CHECK: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]} ; CHECK: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]], [[META2]]} ; CHECK: [[LOOP33]] = distinct !{[[LOOP33]], [[META2]], [[META1]]} +; CHECK: [[LOOP34]] = distinct !{[[LOOP34]], [[META1]], [[META2]]} +; CHECK: [[LOOP35]] = distinct !{[[LOOP35]], [[META2]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll new file mode 100644 index 0000000000000..362ec22600f92 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll @@ -0,0 +1,135 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize,interleaved-access -mattr=+sve -S -o - %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +%struct.xyzt = type { i32, i32, i32, i32 } +; for (int i = 0; i < 1024; ++i) { +; dst[i].x = a[i].x + b[i].x; +; dst[i].y = a[i].y - b[i].y; +; dst[i].z = a[i].z << b[i].z; +; dst[i].t = a[i].t >> b[i].t; +; } + +define void @interleave_deinterleave(ptr noalias %dst, ptr %a, ptr %b) { +; CHECK-LABEL: @interleave_deinterleave( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> splat (i1 true), ptr [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[LDN9:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> splat (i1 true), ptr [[TMP13]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 2 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 3 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw <vscale x 4 x i32> [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = shl <vscale x 4 x i32> [[TMP11]], [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = ashr <vscale x 4 x i32> [[TMP12]], [[TMP19]] +; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP23]], <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i1> splat (i1 true), ptr [[TMP21]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 
[[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP33]], [[TMP26]] +; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 +; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[Z]], align 4 +; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP27]], [[TMP28]] +; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 +; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 +; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.a = getelementptr inbounds %struct.xyzt, ptr %a, i64 %iv + %a.0 = load i32, ptr %gep.a, align 4 + %gep.b = getelementptr inbounds %struct.xyzt, ptr %b, i64 %iv + %b.0 = load i32, ptr %gep.b, align 4 + %add = add nsw i32 %b.0, %a.0 + %gep.dst = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %iv + store i32 %add, ptr %gep.dst, align 4 + %gep.a.1 = getelementptr inbounds nuw i8, ptr %gep.a, i64 4 + %a.1 = load i32, ptr %gep.a.1, align 4 + %gep.b.1 = getelementptr inbounds nuw i8, ptr %gep.b, i64 4 + %b.1 = load i32, ptr %gep.b.1, align 4 + %sub = sub nsw i32 %a.1, %b.1 + %gep.dst.1 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 4 + store i32 %sub, ptr %gep.dst.1, align 4 + %gep.a.2 = getelementptr inbounds nuw i8, 
ptr %gep.a, i64 8 + %a.2 = load i32, ptr %gep.a.2, align 4 + %gep.b.2 = getelementptr inbounds nuw i8, ptr %gep.b, i64 8 + %b.2 = load i32, ptr %gep.b.2, align 4 + %shl = shl i32 %a.2, %b.2 + %gep.dst.2 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 8 + store i32 %shl, ptr %gep.dst.2, align 4 + %gep.a.3 = getelementptr inbounds nuw i8, ptr %gep.a, i64 12 + %a.3 = load i32, ptr %gep.a.3, align 4 + %gep.b.3 = getelementptr inbounds nuw i8, ptr %gep.b, i64 12 + %b.3 = load i32, ptr %gep.b.3, align 4 + %shr = ashr i32 %a.3, %b.3 + %gep.dst.3 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 12 + store i32 %shr, ptr %gep.dst.3, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp index bf9c597d8ac5e..736a36da97f57 100644 --- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp +++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp @@ -392,6 +392,7 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { SDValue FPToSI = DAG->getNode(ISD::FP_TO_SINT, DL, FloatVT, Op2); SDValue FPToUI = DAG->getNode(ISD::FP_TO_UINT, DL, FloatVT, Op2); + SDValue Bcast = DAG->getNode(ISD::BITCAST, DL, FloatVT, Op0); SDValue Brev = DAG->getNode(ISD::BITREVERSE, DL, Int32VT, Op0); SDValue Bswap = DAG->getNode(ISD::BSWAP, DL, Int32VT, Op0); @@ -423,8 +424,12 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { EXPECT_FALSE(sd_match(FPToUI, m_FPToSI(m_Value()))); EXPECT_FALSE(sd_match(FPToSI, m_FPToUI(m_Value()))); + EXPECT_TRUE(sd_match(Bcast, m_BitCast(m_Value()))); + EXPECT_TRUE(sd_match(Bcast, m_BitCast(m_SpecificVT(MVT::i32)))); EXPECT_TRUE(sd_match(Brev, m_BitReverse(m_Value()))); EXPECT_TRUE(sd_match(Bswap, m_BSwap(m_Value()))); + EXPECT_FALSE(sd_match(Bcast, m_BitReverse(m_Value()))); + EXPECT_FALSE(sd_match(Bcast, m_BitCast(m_SpecificVT(MVT::f32)))); EXPECT_FALSE(sd_match(Brev, m_BSwap(m_Value()))); EXPECT_FALSE(sd_match(Bswap, m_BitReverse(m_Value()))); diff --git a/mlir/include/mlir/Target/SPIRV/Deserialization.h b/mlir/include/mlir/Target/SPIRV/Deserialization.h index e39258beeaac8..a346a7fd1e5f7 100644 --- a/mlir/include/mlir/Target/SPIRV/Deserialization.h +++ b/mlir/include/mlir/Target/SPIRV/Deserialization.h @@ -15,6 +15,7 @@ #include "mlir/IR/OwningOpRef.h" #include "mlir/Support/LLVM.h" +#include <cstdint> namespace mlir { class MLIRContext; diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir index 390ecabaef21b..4e4b9e5698fe9 100644 --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -1368,6 +1368,77 @@ llvm.func @omp_atomic_read(%arg0 : !llvm.ptr, %arg1 : !llvm.ptr) -> () { // ----- +// CHECK-LABEL: @omp_atomic_read_implicit_cast +llvm.func @omp_atomic_read_implicit_cast () { +//CHECK: %[[Z:.*]] = alloca float, i64 1, align 4 +//CHECK: %[[Y:.*]] = alloca double, i64 1, align 8 +//CHECK: %[[X:.*]] = alloca [2 x { float, float }], i64 1, align 8 +//CHECK: %[[W:.*]] = alloca i32, i64 1, align 4 +//CHECK: %[[X_ELEMENT:.*]] = getelementptr { float, float }, ptr %3, i64 0 + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x f32 {bindc_name = "z"} : (i64) -> !llvm.ptr + %2 = llvm.mlir.constant(1 : i64) : i64 + %3 = llvm.alloca %2 x f64 {bindc_name = "y"} : (i64) -> !llvm.ptr + %4 = llvm.mlir.constant(1 : i64) : i64 + %5 = llvm.alloca %4 x !llvm.array<2 x 
struct<(f32, f32)>> {bindc_name = "x"} : (i64) -> !llvm.ptr + %6 = llvm.mlir.constant(1 : i64) : i64 + %7 = llvm.alloca %6 x i32 {bindc_name = "w"} : (i64) -> !llvm.ptr + %8 = llvm.mlir.constant(1 : index) : i64 + %9 = llvm.mlir.constant(2 : index) : i64 + %10 = llvm.mlir.constant(1 : i64) : i64 + %11 = llvm.mlir.constant(0 : i64) : i64 + %12 = llvm.sub %8, %10 overflow<nsw> : i64 + %13 = llvm.mul %12, %10 overflow<nsw> : i64 + %14 = llvm.mul %13, %10 overflow<nsw> : i64 + %15 = llvm.add %14, %11 overflow<nsw> : i64 + %16 = llvm.mul %10, %9 overflow<nsw> : i64 + %17 = llvm.getelementptr %5[%15] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<(f32, f32)> + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = alloca { float, float }, align 8 +//CHECK: call void @__atomic_load(i64 8, ptr %[[X_ELEMENT]], ptr %[[ATOMIC_LOAD_TEMP]], i32 0) +//CHECK: %[[LOAD:.*]] = load { float, float }, ptr %[[ATOMIC_LOAD_TEMP]], align 8 +//CHECK: %[[EXT:.*]] = extractvalue { float, float } %[[LOAD]], 0 +//CHECK: store float %[[EXT]], ptr %[[Y]], align 4 + omp.atomic.read %3 = %17 : !llvm.ptr, !llvm.ptr, !llvm.struct<(f32, f32)> + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[Z]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i32 %[[ATOMIC_LOAD_TEMP]] to float +//CHECK: %[[LOAD:.*]] = fpext float %[[CAST]] to double +//CHECK: store double %[[LOAD]], ptr %[[Y]], align 8 + omp.atomic.read %3 = %1 : !llvm.ptr, !llvm.ptr, f32 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[W]] monotonic, align 4 +//CHECK: %[[LOAD:.*]] = sitofp i32 %[[ATOMIC_LOAD_TEMP]] to double +//CHECK: store double %[[LOAD]], ptr %[[Y]], align 8 + omp.atomic.read %3 = %7 : !llvm.ptr, !llvm.ptr, i32 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i64, ptr %[[Y]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i64 %[[ATOMIC_LOAD_TEMP]] to double +//CHECK: %[[LOAD:.*]] = fptrunc double %[[CAST]] to float +//CHECK: store float %[[LOAD]], ptr %[[Z]], align 4 + omp.atomic.read %1 = %3 : !llvm.ptr, !llvm.ptr, f64 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[W]] monotonic, align 4 +//CHECK: %[[LOAD:.*]] = sitofp i32 %[[ATOMIC_LOAD_TEMP]] to float +//CHECK: store float %[[LOAD]], ptr %[[Z]], align 4 + omp.atomic.read %1 = %7 : !llvm.ptr, !llvm.ptr, i32 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i64, ptr %[[Y]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i64 %[[ATOMIC_LOAD_TEMP]] to double +//CHECK: %[[LOAD:.*]] = fptosi double %[[CAST]] to i32 +//CHECK: store i32 %[[LOAD]], ptr %[[W]], align 4 + omp.atomic.read %7 = %3 : !llvm.ptr, !llvm.ptr, f64 + +//CHECK: %[[ATOMIC_LOAD_TEMP:.*]] = load atomic i32, ptr %[[Z]] monotonic, align 4 +//CHECK: %[[CAST:.*]] = bitcast i32 %[[ATOMIC_LOAD_TEMP]] to float +//CHECK: %[[LOAD:.*]] = fptosi float %[[CAST]] to i32 +//CHECK: store i32 %[[LOAD]], ptr %[[W]], align 4 + omp.atomic.read %7 = %1 : !llvm.ptr, !llvm.ptr, f32 + llvm.return +} + +// ----- + // CHECK-LABEL: @omp_atomic_write // CHECK-SAME: (ptr %[[x:.*]], i32 %[[expr:.*]]) llvm.func @omp_atomic_write(%x: !llvm.ptr, %expr: i32) -> () { diff --git a/offload/cmake/caches/AMDGPUBot.cmake b/offload/cmake/caches/AMDGPUBot.cmake index d72b620ae3080..69bef91b2ce49 100644 --- a/offload/cmake/caches/AMDGPUBot.cmake +++ b/offload/cmake/caches/AMDGPUBot.cmake @@ -1,17 +1,19 @@ -# This file is meant for test builds on one basic AMDGPU buildbot only. 
+# This file is used across all AMDGPU-cmake builders # Install directory set to /tmp as this is a bot config set(CMAKE_INSTALL_PREFIX /tmp/llvm.install.test CACHE STRING "") +# General settings +set(CMAKE_BUILD_TYPE Release CACHE STRING "") +set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "") +set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "") + set(LLVM_ENABLE_PROJECTS "clang;lld" CACHE STRING "") set(LLVM_ENABLE_RUNTIMES "compiler-rt;openmp;offload" CACHE STRING "") + set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "") set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "") -set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 32" CACHE STRING "") set(LLVM_TARGETS_TO_BUILD "host;AMDGPU" CACHE STRING "") +set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 32" CACHE STRING "") set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "") - -set(CMAKE_BUILD_TYPE Release CACHE STRING "") -set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "") -set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "") diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S index cc5344cdd124a..0bf9f07a13f14 100644 --- a/openmp/runtime/src/z_Linux_asm.S +++ b/openmp/runtime/src/z_Linux_asm.S @@ -19,6 +19,16 @@ #if KMP_ARCH_X86 || KMP_ARCH_X86_64 +# if defined(__ELF__) && defined(__CET__) && defined(__has_include) +# if __has_include(<cet.h>) +# include <cet.h> +# endif +# endif + +# if !defined(_CET_ENDBR) +# define _CET_ENDBR +# endif + # if KMP_MIC // the 'delay r16/r32/r64' should be used instead of the 'pause'. // The delay operation has the effect of removing the current thread from @@ -66,6 +76,7 @@ ALIGN 4 .globl KMP_PREFIX_UNDERSCORE($0) KMP_PREFIX_UNDERSCORE($0): + _CET_ENDBR .endmacro # else // KMP_OS_DARWIN # define KMP_PREFIX_UNDERSCORE(x) x //no extra underscore for Linux* OS symbols @@ -92,6 +103,7 @@ KMP_PREFIX_UNDERSCORE($0): .globl KMP_PREFIX_UNDERSCORE(\proc) KMP_PREFIX_UNDERSCORE(\proc): .cfi_startproc + _CET_ENDBR .endm .macro KMP_CFI_DEF_OFFSET sz .cfi_def_cfa_offset \sz diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index f72babb646a85..2286d4cd35e08 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -700,6 +700,7 @@ cc_library( includes = ["include"], textual_hdrs = [ # keep sorted + "include/clang/Basic/AllDiagnosticKinds.inc", "include/clang/Basic/AttrHasAttributeImpl.inc", "include/clang/Basic/AttrList.inc", "include/clang/Basic/AttrSubMatchRulesList.inc",