diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 198a8d8bf48f8..3818e3d66c994 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -533,9 +533,7 @@ class MCPlusBuilder { return Analysis->isReturn(Inst); } - virtual bool isTerminator(const MCInst &Inst) const { - return Analysis->isTerminator(Inst); - } + virtual bool isTerminator(const MCInst &Inst) const; virtual bool isNoop(const MCInst &Inst) const { llvm_unreachable("not implemented"); diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp index 5b14ad5cdb880..f22b35a90bc5d 100644 --- a/bolt/lib/Core/MCPlusBuilder.cpp +++ b/bolt/lib/Core/MCPlusBuilder.cpp @@ -12,12 +12,14 @@ #include "bolt/Core/MCPlusBuilder.h" #include "bolt/Core/MCPlus.h" +#include "bolt/Utils/CommandLineOpts.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include #include @@ -28,6 +30,13 @@ using namespace llvm; using namespace bolt; using namespace MCPlus; +namespace opts { +cl::opt + TerminalTrap("terminal-trap", + cl::desc("Assume that execution stops at trap instruction"), + cl::init(true), cl::Hidden, cl::cat(BoltCategory)); +} + bool MCPlusBuilder::equals(const MCInst &A, const MCInst &B, CompFuncTy Comp) const { if (A.getOpcode() != B.getOpcode()) @@ -121,6 +130,11 @@ bool MCPlusBuilder::equals(const MCTargetExpr &A, const MCTargetExpr &B, llvm_unreachable("target-specific expressions are unsupported"); } +bool MCPlusBuilder::isTerminator(const MCInst &Inst) const { + return Analysis->isTerminator(Inst) || + (opts::TerminalTrap && Info->get(Inst.getOpcode()).isTrap()); +} + void MCPlusBuilder::setTailCall(MCInst &Inst) const { assert(!hasAnnotation(Inst, MCAnnotation::kTailCall)); setAnnotationOpValue(Inst, MCAnnotation::kTailCall, true); diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 2ead51ff6a128..5ca5594117a62 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -84,6 +84,7 @@ extern cl::opt JumpTables; extern cl::opt KeepNops; extern cl::list ReorderData; extern cl::opt ReorderFunctions; +extern cl::opt TerminalTrap; extern cl::opt TimeBuild; cl::opt AllowStripped("allow-stripped", @@ -2033,8 +2034,14 @@ void RewriteInstance::adjustCommandLineOptions() { if (opts::Lite) BC->outs() << "BOLT-INFO: enabling lite mode\n"; - if (BC->IsLinuxKernel && !opts::KeepNops.getNumOccurrences()) - opts::KeepNops = true; + if (BC->IsLinuxKernel) { + if (!opts::KeepNops.getNumOccurrences()) + opts::KeepNops = true; + + // Linux kernel may resume execution after a trap instruction in some cases. + if (!opts::TerminalTrap.getNumOccurrences()) + opts::TerminalTrap = false; + } } namespace { diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index 15f95f8217776..8b1894953f375 100644 --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -211,13 +211,6 @@ class X86MCPlusBuilder : public MCPlusBuilder { return false; } - // FIXME: For compatibility with old LLVM only! 
- bool isTerminator(const MCInst &Inst) const override { - unsigned Opcode = Inst.getOpcode(); - return Info->get(Opcode).isTerminator() || X86::isUD1(Opcode) || - X86::isUD2(Opcode); - } - bool isIndirectCall(const MCInst &Inst) const override { return isCall(Inst) && ((getMemoryOperandNo(Inst) != -1) || Inst.getOperand(0).isReg()); diff --git a/bolt/test/X86/linux-bug-table.s b/bolt/test/X86/linux-bug-table.s index f688a60c97719..63f70a0b35d9f 100644 --- a/bolt/test/X86/linux-bug-table.s +++ b/bolt/test/X86/linux-bug-table.s @@ -40,6 +40,10 @@ _start: # CHECK-REOPT-SAME: BugEntry: 2 ret +## The return instruction is reachable only via preceding ud2. Test that it is +## treated as a reachable instruction in the Linux kernel mode. + +# CHECK-REOPT-NEXT: ret .size _start, .-_start diff --git a/clang/cmake/caches/CrossWinToARMLinux.cmake b/clang/cmake/caches/CrossWinToARMLinux.cmake index 2a0953af53fad..736a54ece550c 100644 --- a/clang/cmake/caches/CrossWinToARMLinux.cmake +++ b/clang/cmake/caches/CrossWinToARMLinux.cmake @@ -29,6 +29,11 @@ # cmake --build . --target check-cxxabi- # cmake --build . --target check-unwind- # cmake --build . --target check-cxx- +# (another way to execute the tests) +# python bin/llvm-lit.py -v --threads=32 runtimes/runtimes-bins/libunwind/test 2>&1 | tee libunwind-tests.log +# python bin/llvm-lit.py -v --threads=32 runtimes/runtimes--bins/libcxxabi/test 2>&1 | tee libcxxabi-tests.log +# python bin/llvm-lit.py -v --threads=32 runtimes/runtimes--bins/libcxx/test 2>&1 | tee libcxx-tests.log + # LLVM_PROJECT_DIR is the path to the llvm-project directory. # The right way to compute it would probably be to use "${CMAKE_SOURCE_DIR}/../", @@ -42,9 +47,6 @@ if (NOT DEFINED DEFAULT_SYSROOT) message(WARNING "DEFAULT_SYSROOT must be specified for the cross toolchain build.") endif() -if (NOT DEFINED LLVM_TARGETS_TO_BUILD) - set(LLVM_TARGETS_TO_BUILD "ARM" CACHE STRING "") -endif() if (NOT DEFINED LLVM_ENABLE_ASSERTIONS) set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "") endif() @@ -56,7 +58,7 @@ if (NOT DEFINED LLVM_ENABLE_RUNTIMES) endif() if (NOT DEFINED TOOLCHAIN_TARGET_TRIPLE) - set(TOOLCHAIN_TARGET_TRIPLE "armv7-unknown-linux-gnueabihf") + set(TOOLCHAIN_TARGET_TRIPLE "aarch64-unknown-linux-gnu") else() #NOTE: we must normalize specified target triple to a fully specified triple, # including the vendor part. 
It is necessary to synchronize the runtime library @@ -74,24 +76,38 @@ else() string(REPLACE ";" "-" TOOLCHAIN_TARGET_TRIPLE "${TOOLCHAIN_TARGET_TRIPLE}") endif() +message(STATUS "Toolchain target triple: ${TOOLCHAIN_TARGET_TRIPLE}") + +if (NOT DEFINED LLVM_TARGETS_TO_BUILD) + if ("${TOOLCHAIN_TARGET_TRIPLE}" MATCHES "^(armv|arm32)+") + set(LLVM_TARGETS_TO_BUILD "ARM" CACHE STRING "") + endif() + if ("${TOOLCHAIN_TARGET_TRIPLE}" MATCHES "^(aarch64|arm64)+") + set(LLVM_TARGETS_TO_BUILD "AArch64" CACHE STRING "") + endif() +endif() + +message(STATUS "Toolchain target to build: ${LLVM_TARGETS_TO_BUILD}") + if (NOT DEFINED CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release" CACHE STRING "") endif() -message(STATUS "Toolchain target triple: ${TOOLCHAIN_TARGET_TRIPLE}") - set(CMAKE_CROSSCOMPILING ON CACHE BOOL "") set(CMAKE_CL_SHOWINCLUDES_PREFIX "Note: including file: " CACHE STRING "") # Required if COMPILER_RT_DEFAULT_TARGET_ONLY is ON set(CMAKE_C_COMPILER_TARGET "${TOOLCHAIN_TARGET_TRIPLE}" CACHE STRING "") set(CMAKE_CXX_COMPILER_TARGET "${TOOLCHAIN_TARGET_TRIPLE}" CACHE STRING "") -set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "") set(LLVM_DEFAULT_TARGET_TRIPLE "${TOOLCHAIN_TARGET_TRIPLE}" CACHE STRING "") set(LLVM_TARGET_ARCH "${TOOLCHAIN_TARGET_TRIPLE}" CACHE STRING "") set(LLVM_LIT_ARGS "-vv ${LLVM_LIT_ARGS}" CACHE STRING "" FORCE) +set(CLANG_DEFAULT_CXX_STDLIB "libc++" CACHE STRING "") set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "") +set(CLANG_DEFAULT_OBJCOPY "llvm-objcopy" CACHE STRING "") +set(CLANG_DEFAULT_RTLIB "compiler-rt" CACHE STRING "") +set(CLANG_DEFAULT_UNWINDLIB "libunwind" CACHE STRING "") if(WIN32) set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded" CACHE STRING "") @@ -109,9 +125,10 @@ set(BUILTINS_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_SYSTEM_NAME set(BUILTINS_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_SYSROOT "${DEFAULT_SYSROOT}" CACHE STRING "") set(BUILTINS_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_INSTALL_RPATH "${RUNTIMES_INSTALL_RPATH}" CACHE STRING "") set(BUILTINS_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_BUILD_WITH_INSTALL_RPATH ON CACHE BOOL "") - +set(BUILTINS_${TOOLCHAIN_TARGET_TRIPLE}_LLVM_CMAKE_DIR "${LLVM_PROJECT_DIR}/llvm/cmake/modules" CACHE PATH "") set(LLVM_RUNTIME_TARGETS "${TOOLCHAIN_TARGET_TRIPLE}" CACHE STRING "") +set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "") set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LLVM_ENABLE_RUNTIMES "${LLVM_ENABLE_RUNTIMES}" CACHE STRING "") @@ -125,13 +142,16 @@ set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_BUILD_SANITIZERS set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_BUILD_XRAY OFF CACHE BOOL "") set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_BUILD_LIBFUZZER OFF CACHE BOOL "") set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_BUILD_PROFILE OFF CACHE BOOL "") -set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_BUILD_CRT OFF CACHE BOOL "") +set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_BUILD_CRT ON CACHE BOOL "") set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_BUILD_ORC OFF CACHE BOOL "") set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_DEFAULT_TARGET_ONLY ON CACHE BOOL "") set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_INCLUDE_TESTS ON CACHE BOOL "") set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_CAN_EXECUTE_TESTS ON CACHE BOOL "") set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_USE_BUILTINS_LIBRARY ON CACHE BOOL "") +set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_CXX_LIBRARY libcxx CACHE STRING "") +# Tell Clang to seach C++ headers alongside with the just-built binaries for the C++ compiler-rt 
tests. +set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_TEST_COMPILER_CFLAGS "--stdlib=libc++" CACHE STRING "") set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBUNWIND_USE_COMPILER_RT ON CACHE BOOL "") set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBUNWIND_ENABLE_SHARED OFF CACHE BOOL "") @@ -148,8 +168,10 @@ set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ABI_VERSION set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_CXX_ABI "libcxxabi" CACHE STRING "") #!!! set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS ON CACHE BOOL "") - +# Avoid searching for the python3 interpreter during the runtimes configuration for the cross builds. +# It starts searching the python3 package using the target's sysroot path, that usually is not compatible with the build host. find_package(Python3 COMPONENTS Interpreter) +set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_Python3_EXECUTABLE ${Python3_EXECUTABLE} CACHE PATH "") set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBUNWIND_TEST_PARAMS_default "${RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_TEST_PARAMS}") set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXXABI_TEST_PARAMS_default "${RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_TEST_PARAMS}") diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 6a69e86ce1ae6..b1dbcc72da96b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -238,21 +238,6 @@ Attribute Changes in Clang added a new extension query ``__has_extension(swiftcc)`` corresponding to the ``__attribute__((swiftcc))`` attribute. -- The ``_Nullable`` and ``_Nonnull`` family of type attributes can now apply - to certain C++ class types, such as smart pointers: - ``void useObject(std::unique_ptr _Nonnull obj);``. - - This works for standard library types including ``unique_ptr``, ``shared_ptr``, - and ``function``. See - `the attribute reference documentation `_ - for the full list. - -- The ``_Nullable`` attribute can be applied to C++ class declarations: - ``template class _Nullable MySmartPointer {};``. - - This allows the ``_Nullable`` and ``_Nonnull`` family of type attributes to - apply to this class. - Improvements to Clang's diagnostics ----------------------------------- - Clang now applies syntax highlighting to the code snippets it @@ -307,6 +292,9 @@ Improvements to Clang's diagnostics - ``-Wmicrosoft``, ``-Wgnu``, or ``-pedantic`` is now required to diagnose C99 flexible array members in a union or alone in a struct. Fixes GH#84565. +- Clang now no longer diagnoses type definitions in ``offsetof`` in C23 mode. + Fixes #GH83658. + Improvements to Clang's time-trace ---------------------------------- @@ -357,6 +345,9 @@ Bug Fixes in This Version - Fixes an assertion failure on invalid code when trying to define member functions in lambdas. +- Fixed a regression in CTAD that a friend declaration that befriends itself may cause + incorrect constraint substitution. (#GH86769). + Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 47ed6d0d1db0d..858450926455c 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -669,9 +669,8 @@ class alignas(8) Decl { /// Whether this declaration comes from another module unit. bool isInAnotherModuleUnit() const; - /// FIXME: Implement discarding declarations actually in global module - /// fragment. See [module.global.frag]p3,4 for details. 
- bool isDiscardedInGlobalModuleFragment() const { return false; } + /// Whether this declaration comes from explicit global module. + bool isFromExplicitGlobalModule() const; /// Check if we should skip checking ODRHash for declaration \param D. /// diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 6584460cf5685..80e607525a0a3 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -2178,10 +2178,9 @@ def TypeNonNull : TypeAttr { let Documentation = [TypeNonNullDocs]; } -def TypeNullable : DeclOrTypeAttr { +def TypeNullable : TypeAttr { let Spellings = [CustomKeyword<"_Nullable">]; let Documentation = [TypeNullableDocs]; -// let Subjects = SubjectList<[CXXRecord], ErrorDiag>; } def TypeNullableResult : TypeAttr { diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 0ca4ea377fc36..3ea4d676b4f89 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -4151,20 +4151,6 @@ non-underscored keywords. For example: @property (assign, nullable) NSView *superview; @property (readonly, nonnull) NSArray *subviews; @end - -As well as built-in pointer types, the nullability attributes can be attached -to C++ classes marked with the ``_Nullable`` attribute. - -The following C++ standard library types are considered nullable: -``unique_ptr``, ``shared_ptr``, ``auto_ptr``, ``exception_ptr``, ``function``, -``move_only_function`` and ``coroutine_handle``. - -Types should be marked nullable only where the type itself leaves nullability -ambiguous. For example, ``std::optional`` is not marked ``_Nullable``, because -``optional _Nullable`` is redundant and ``optional _Nonnull`` is -not a useful type. ``std::weak_ptr`` is not nullable, because its nullability -can change with no visible modification, so static annotation is unlikely to be -unhelpful. }]; } @@ -4199,17 +4185,6 @@ The ``_Nullable`` nullability qualifier indicates that a value of the int fetch_or_zero(int * _Nullable ptr); a caller of ``fetch_or_zero`` can provide null. - -The ``_Nullable`` attribute on classes indicates that the given class can -represent null values, and so the ``_Nullable``, ``_Nonnull`` etc qualifiers -make sense for this type. For example: - - .. code-block:: c - - class _Nullable ArenaPointer { ... 
}; - - ArenaPointer _Nonnull x = ...; - ArenaPointer _Nullable y = nullptr; }]; } diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 5c6e62e59721d..7049a6f0eaba4 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -1748,8 +1748,8 @@ def err_type_defined_in_condition : Error< def err_type_defined_in_enum : Error< "%0 cannot be defined in an enumeration">; def ext_type_defined_in_offsetof : Extension< - "defining a type within '%select{__builtin_offsetof|offsetof}0' is a Clang " - "extension">, InGroup; + "defining a type within '%select{__builtin_offsetof|offsetof}0' is a C23 " + "extension">, InGroup; def note_pure_virtual_function : Note< "unimplemented pure virtual method %0 in %1">; diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def index fe4d1c4afcca6..b41aadc73f205 100644 --- a/clang/include/clang/Basic/Features.def +++ b/clang/include/clang/Basic/Features.def @@ -94,7 +94,6 @@ EXTENSION(define_target_os_macros, FEATURE(enumerator_attributes, true) FEATURE(nullability, true) FEATURE(nullability_on_arrays, true) -FEATURE(nullability_on_classes, true) FEATURE(nullability_nullable_result, true) FEATURE(memory_sanitizer, LangOpts.Sanitize.hasOneOf(SanitizerKind::Memory | diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 374595edd2ce4..e1ef7454f0166 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -267,6 +267,9 @@ class TargetInfo : public TransferrableTargetInfo, LLVM_PREFERRED_TYPE(bool) unsigned AllowAMDGPUUnsafeFPAtomics : 1; + LLVM_PREFERRED_TYPE(bool) + unsigned HasUnalignedAccess : 1; + unsigned ARMCDECoprocMask : 8; unsigned MaxOpenCLWorkGroupSize; @@ -859,6 +862,18 @@ class TargetInfo : public TransferrableTargetInfo, return PointerWidth; } + /// Return true iff unaligned accesses are a single instruction (rather than + /// a synthesized sequence). + bool hasUnalignedAccess() const { return HasUnalignedAccess; } + + /// Return true iff unaligned accesses are cheap. This affects placement and + /// size of bitfield loads/stores. (Not the ABI-mandated placement of + /// the bitfields themselves.) + bool hasCheapUnalignedBitFieldAccess() const { + // Simply forward to the unaligned access getter. + return hasUnalignedAccess(); + } + /// \brief Returns the default value of the __USER_LABEL_PREFIX__ macro, /// which is the prefix given to user symbols by default. 
/// diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 3c896bffe9666..a46803a348739 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3712,6 +3712,9 @@ defm preserve_as_comments : BoolFOption<"preserve-as-comments", "Do not preserve comments in inline assembly">, PosFlag>; def framework : Separate<["-"], "framework">, Flags<[LinkerInput]>; +def reexport_framework : Separate<["-"], "reexport_framework">, Flags<[LinkerInput]>; +def reexport_l : Joined<["-"], "reexport-l">, Flags<[LinkerInput]>; +def reexport_library : JoinedOrSeparate<["-"], "reexport_library">, Flags<[LinkerInput]>; def frandom_seed_EQ : Joined<["-"], "frandom-seed=">, Group; def freg_struct_return : Flag<["-"], "freg-struct-return">, Group, Visibility<[ClangOption, CC1Option]>, diff --git a/clang/include/clang/Frontend/CompilerInstance.h b/clang/include/clang/Frontend/CompilerInstance.h index cce91862ae3d0..3464654284f19 100644 --- a/clang/include/clang/Frontend/CompilerInstance.h +++ b/clang/include/clang/Frontend/CompilerInstance.h @@ -133,6 +133,24 @@ class CompilerInstance : public ModuleLoader { std::vector> DependencyCollectors; + /// Records the set of modules + class FailedModulesSet { + llvm::StringSet<> Failed; + + public: + bool hasAlreadyFailed(StringRef module) { return Failed.count(module) > 0; } + + void addFailed(StringRef module) { Failed.insert(module); } + }; + + /// The set of modules that failed to build. + /// + /// This pointer will be shared among all of the compiler instances created + /// to (re)build modules, so that once a module fails to build anywhere, + /// other instances will see that the module has failed and won't try to + /// build it again. + std::shared_ptr FailedModules; + /// The set of top-level modules that has already been built on the /// fly as part of this overall compilation action. std::map> BuiltModules; @@ -619,6 +637,24 @@ class CompilerInstance : public ModuleLoader { } /// @} + /// @name Failed modules set + /// @{ + + bool hasFailedModulesSet() const { return (bool)FailedModules; } + + void createFailedModulesSet() { + FailedModules = std::make_shared(); + } + + std::shared_ptr getFailedModulesSetPtr() const { + return FailedModules; + } + + void setFailedModulesSet(std::shared_ptr FMS) { + FailedModules = FMS; + } + + /// } /// @name Output Files /// @{ diff --git a/clang/include/clang/Frontend/FrontendActions.h b/clang/include/clang/Frontend/FrontendActions.h index a620ddfc40447..0518a8823a03e 100644 --- a/clang/include/clang/Frontend/FrontendActions.h +++ b/clang/include/clang/Frontend/FrontendActions.h @@ -34,12 +34,18 @@ class InitOnlyAction : public FrontendAction { /// Preprocessor-based frontend action that also loads PCH files. 
class ReadPCHAndPreprocessAction : public FrontendAction { + llvm::unique_function AdjustCI; + void ExecuteAction() override; std::unique_ptr CreateASTConsumer(CompilerInstance &CI, StringRef InFile) override; public: + ReadPCHAndPreprocessAction( + llvm::unique_function AdjustCI) + : AdjustCI(std::move(AdjustCI)) {} + bool usesPreprocessorOnly() const override { return false; } }; @@ -321,11 +327,15 @@ class PrintPreprocessedAction : public PreprocessorFrontendAction { class GetDependenciesByModuleNameAction : public PreprocessOnlyAction { StringRef ModuleName; + llvm::unique_function AdjustCI; + void ExecuteAction() override; public: - GetDependenciesByModuleNameAction(StringRef ModuleName) - : ModuleName(ModuleName) {} + GetDependenciesByModuleNameAction( + StringRef ModuleName, + llvm::unique_function AdjustCI) + : ModuleName(ModuleName), AdjustCI(std::move(AdjustCI)) {} }; } // end namespace clang diff --git a/clang/include/clang/InstallAPI/DylibVerifier.h b/clang/include/clang/InstallAPI/DylibVerifier.h index 49de24763f1f9..22cdc234486cf 100644 --- a/clang/include/clang/InstallAPI/DylibVerifier.h +++ b/clang/include/clang/InstallAPI/DylibVerifier.h @@ -31,6 +31,7 @@ enum class VerificationMode { class DylibVerifier : llvm::MachO::RecordVisitor { private: struct SymbolContext; + struct DWARFContext; public: enum class Result { NoVerify, Ignore, Valid, Invalid }; @@ -54,7 +55,7 @@ class DylibVerifier : llvm::MachO::RecordVisitor { DiagnosticsEngine *Diag = nullptr; // Handle diagnostics reporting for target level violations. - void emitDiag(llvm::function_ref Report); + void emitDiag(llvm::function_ref Report, RecordLoc *Loc = nullptr); VerifierContext() = default; VerifierContext(DiagnosticsEngine *Diag) : Diag(Diag) {} @@ -63,9 +64,10 @@ class DylibVerifier : llvm::MachO::RecordVisitor { DylibVerifier() = default; DylibVerifier(llvm::MachO::Records &&Dylib, DiagnosticsEngine *Diag, - VerificationMode Mode, bool Demangle) + VerificationMode Mode, bool Demangle, StringRef DSYMPath) : Dylib(std::move(Dylib)), Mode(Mode), Demangle(Demangle), - Exports(std::make_unique()), Ctx(VerifierContext{Diag}) {} + DSYMPath(DSYMPath), Exports(std::make_unique()), + Ctx(VerifierContext{Diag}) {} Result verify(GlobalRecord *R, const FrontendAttrs *FA); Result verify(ObjCInterfaceRecord *R, const FrontendAttrs *FA); @@ -143,6 +145,12 @@ class DylibVerifier : llvm::MachO::RecordVisitor { std::string getAnnotatedName(const Record *R, SymbolContext &SymCtx, bool ValidSourceLoc = true); + /// Extract source location for symbol implementations. + /// As this is a relatively expensive operation, it is only used + /// when there is a violation to report and there is not a known declaration + /// in the interface. + void accumulateSrcLocForDylibSymbols(); + // Symbols in dylib. llvm::MachO::Records Dylib; @@ -152,11 +160,17 @@ class DylibVerifier : llvm::MachO::RecordVisitor { // Attempt to demangle when reporting violations. bool Demangle = false; + // File path to DSYM file. + StringRef DSYMPath; + // Valid symbols in final text file. std::unique_ptr Exports = std::make_unique(); // Track current state of verification while traversing AST. VerifierContext Ctx; + + // Track DWARF provided source location for dylibs. 
+ DWARFContext *DWARFCtx = nullptr; }; } // namespace installapi diff --git a/clang/include/clang/InstallAPI/MachO.h b/clang/include/clang/InstallAPI/MachO.h index 4961c596fd68a..827220dbf39fb 100644 --- a/clang/include/clang/InstallAPI/MachO.h +++ b/clang/include/clang/InstallAPI/MachO.h @@ -34,6 +34,7 @@ using ObjCCategoryRecord = llvm::MachO::ObjCCategoryRecord; using ObjCIVarRecord = llvm::MachO::ObjCIVarRecord; using ObjCIFSymbolKind = llvm::MachO::ObjCIFSymbolKind; using Records = llvm::MachO::Records; +using RecordLoc = llvm::MachO::RecordLoc; using RecordsSlice = llvm::MachO::RecordsSlice; using BinaryAttrs = llvm::MachO::RecordsSlice::BinaryAttrs; using SymbolSet = llvm::MachO::SymbolSet; diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 0836b7d439bb0..24e146a589a75 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -736,6 +736,19 @@ class Preprocessor { State ConditionalStackState = Off; } PreambleConditionalStack; + /// Function for getting the dependency preprocessor directives of a file. + /// + /// These are directives derived from a special form of lexing where the + /// source input is scanned for the preprocessor directives that might have an + /// effect on the dependencies for a compilation unit. + /// + /// Enables a client to cache the directives for a file and provide them + /// across multiple compiler invocations. + /// FIXME: Allow returning an error. + using DependencyDirectivesFn = llvm::unique_function>(FileEntryRef)>; + DependencyDirectivesFn DependencyDirectivesForFile; + /// The current top of the stack that we're lexing from if /// not expanding a macro and we are lexing directly from source code. /// @@ -1270,6 +1283,11 @@ class Preprocessor { /// false if it is producing tokens to be consumed by Parse and Sema. bool isPreprocessedOutput() const { return PreprocessedOutput; } + /// Set the function used to get dependency directives for a file. + void setDependencyDirectivesFn(DependencyDirectivesFn Fn) { + DependencyDirectivesForFile = std::move(Fn); + } + /// Return true if we are lexing directly from the specified lexer. bool isCurrentLexer(const PreprocessorLexer *L) const { return CurPPLexer == L; diff --git a/clang/include/clang/Lex/PreprocessorOptions.h b/clang/include/clang/Lex/PreprocessorOptions.h index f841e4a028df5..50b5fba0ff773 100644 --- a/clang/include/clang/Lex/PreprocessorOptions.h +++ b/clang/include/clang/Lex/PreprocessorOptions.h @@ -186,41 +186,6 @@ class PreprocessorOptions { /// with support for lifetime-qualified pointers. ObjCXXARCStandardLibraryKind ObjCXXARCStandardLibrary = ARCXX_nolib; - /// Records the set of modules - class FailedModulesSet { - llvm::StringSet<> Failed; - - public: - bool hasAlreadyFailed(StringRef module) { - return Failed.count(module) > 0; - } - - void addFailed(StringRef module) { - Failed.insert(module); - } - }; - - /// The set of modules that failed to build. - /// - /// This pointer will be shared among all of the compiler instances created - /// to (re)build modules, so that once a module fails to build anywhere, - /// other instances will see that the module has failed and won't try to - /// build it again. - std::shared_ptr FailedModules; - - /// Function for getting the dependency preprocessor directives of a file. 
- /// - /// These are directives derived from a special form of lexing where the - /// source input is scanned for the preprocessor directives that might have an - /// effect on the dependencies for a compilation unit. - /// - /// Enables a client to cache the directives for a file and provide them - /// across multiple compiler invocations. - /// FIXME: Allow returning an error. - std::function>( - FileEntryRef)> - DependencyDirectivesForFile; - /// Set up preprocessor for RunAnalysis action. bool SetUpStaticAnalyzer = false; diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 580bf2a5d79df..bba8ef4ff0173 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -3014,7 +3014,6 @@ class Parser : public CodeCompletionHandler { void DiagnoseAndSkipExtendedMicrosoftTypeAttributes(); SourceLocation SkipExtendedMicrosoftTypeAttributes(); void ParseMicrosoftInheritanceClassAttributes(ParsedAttributes &attrs); - void ParseNullabilityClassAttributes(ParsedAttributes &attrs); void ParseBorlandTypeAttributes(ParsedAttributes &attrs); void ParseOpenCLKernelAttributes(ParsedAttributes &attrs); void ParseOpenCLQualifiers(ParsedAttributes &Attrs); diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index d8d5ab8b61896..70e916b2e2fb4 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -1655,9 +1655,6 @@ class Sema final { /// Add [[gsl::Pointer]] attributes for std:: types. void inferGslPointerAttribute(TypedefNameDecl *TD); - /// Add _Nullable attributes for std:: types. - void inferNullableClassAttribute(CXXRecordDecl *CRD); - enum PragmaOptionsAlignKind { POAK_Native, // #pragma options align=native POAK_Natural, // #pragma options align=natural diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp index 04bbc49ab2f31..2cbb86b31b5e2 100644 --- a/clang/lib/AST/DeclBase.cpp +++ b/clang/lib/AST/DeclBase.cpp @@ -1102,9 +1102,13 @@ bool Decl::isInAnotherModuleUnit() const { return M != getASTContext().getCurrentNamedModule(); } +bool Decl::isFromExplicitGlobalModule() const { + return getOwningModule() && getOwningModule()->isExplicitGlobalModule(); +} + bool Decl::shouldSkipCheckingODR() const { - return getASTContext().getLangOpts().SkipODRCheckInGMF && getOwningModule() && - getOwningModule()->isExplicitGlobalModule(); + return getASTContext().getLangOpts().SkipODRCheckInGMF && + isFromExplicitGlobalModule(); } static Decl::Kind getKind(const Decl *D) { return D->getKind(); } diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 5a36621dc5cce..dae8f32fc0295 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12361,12 +12361,17 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, if (!EvaluateInteger(E->getArg(0), Val, Info)) return false; + std::optional Fallback; + if (BuiltinOp == Builtin::BI__builtin_clzg && E->getNumArgs() > 1) { + APSInt FallbackTemp; + if (!EvaluateInteger(E->getArg(1), FallbackTemp, Info)) + return false; + Fallback = FallbackTemp; + } + if (!Val) { - if (BuiltinOp == Builtin::BI__builtin_clzg && E->getNumArgs() > 1) { - if (!EvaluateInteger(E->getArg(1), Val, Info)) - return false; - return Success(Val, E); - } + if (Fallback) + return Success(*Fallback, E); // When the argument is 0, the result of GCC builtins is undefined, // whereas for Microsoft intrinsics, the result is the bit-width of the @@ -12425,12 +12430,17 @@ bool 
IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, if (!EvaluateInteger(E->getArg(0), Val, Info)) return false; + std::optional Fallback; + if (BuiltinOp == Builtin::BI__builtin_ctzg && E->getNumArgs() > 1) { + APSInt FallbackTemp; + if (!EvaluateInteger(E->getArg(1), FallbackTemp, Info)) + return false; + Fallback = FallbackTemp; + } + if (!Val) { - if (BuiltinOp == Builtin::BI__builtin_ctzg && E->getNumArgs() > 1) { - if (!EvaluateInteger(E->getArg(1), Val, Info)) - return false; - return Success(Val, E); - } + if (Fallback) + return Success(*Fallback, E); return Error(E); } diff --git a/clang/lib/AST/Interp/ByteCodeStmtGen.h b/clang/lib/AST/Interp/ByteCodeStmtGen.h index ab7a591fb798e..d7e6e5042c274 100644 --- a/clang/lib/AST/Interp/ByteCodeStmtGen.h +++ b/clang/lib/AST/Interp/ByteCodeStmtGen.h @@ -82,6 +82,7 @@ class ByteCodeStmtGen final : public ByteCodeExprGen { OptLabelTy DefaultLabel; }; +extern template class ByteCodeStmtGen; extern template class ByteCodeExprGen; } // namespace interp diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 47fdbfe21e588..8f3e26d460192 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -4642,15 +4642,16 @@ bool Type::canHaveNullability(bool ResultIfUnknown) const { case Type::Auto: return ResultIfUnknown; - // Dependent template specializations could instantiate to pointer types. + // Dependent template specializations can instantiate to pointer + // types unless they're known to be specializations of a class + // template. case Type::TemplateSpecialization: - // If it's a known class template, we can already check if it's nullable. - if (TemplateDecl *templateDecl = - cast(type.getTypePtr()) - ->getTemplateName() - .getAsTemplateDecl()) - if (auto *CTD = dyn_cast(templateDecl)) - return CTD->getTemplatedDecl()->hasAttr(); + if (TemplateDecl *templateDecl + = cast(type.getTypePtr()) + ->getTemplateName().getAsTemplateDecl()) { + if (isa(templateDecl)) + return false; + } return ResultIfUnknown; case Type::Builtin: @@ -4707,17 +4708,6 @@ bool Type::canHaveNullability(bool ResultIfUnknown) const { } llvm_unreachable("unknown builtin type"); - case Type::Record: { - const RecordDecl *RD = cast(type)->getDecl(); - // For template specializations, look only at primary template attributes. - // This is a consistent regardless of whether the instantiation is known. - if (const auto *CTSD = dyn_cast(RD)) - return CTSD->getSpecializedTemplate() - ->getTemplatedDecl() - ->hasAttr(); - return RD->hasAttr(); - } - // Non-pointer types. case Type::Complex: case Type::LValueReference: @@ -4735,6 +4725,7 @@ bool Type::canHaveNullability(bool ResultIfUnknown) const { case Type::DependentAddressSpace: case Type::FunctionProto: case Type::FunctionNoProto: + case Type::Record: case Type::DeducedTemplateSpecialization: case Type::Enum: case Type::InjectedClassName: diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp index 32972e749a4d6..3048febdaca0e 100644 --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -156,6 +156,7 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) { HasAArch64SVETypes = false; HasRISCVVTypes = false; AllowAMDGPUUnsafeFPAtomics = false; + HasUnalignedAccess = false; ARMCDECoprocMask = 0; // Default to no types using fpret. 
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index d3ac8a6f8635f..da00e6e93b56c 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -187,6 +187,8 @@ AArch64TargetInfo::AArch64TargetInfo(const llvm::Triple &Triple, assert(UseBitFieldTypeAlignment && "bitfields affect type alignment"); UseZeroLengthBitfieldAlignment = true; + HasUnalignedAccess = true; + // AArch64 targets default to using the ARM C++ ABI. TheCXXABI.set(TargetCXXABI::GenericAArch64); @@ -495,7 +497,7 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts, if (HasPAuthLR) Builder.defineMacro("__ARM_FEATURE_PAUTH_LR", "1"); - if (HasUnaligned) + if (HasUnalignedAccess) Builder.defineMacro("__ARM_FEATURE_UNALIGNED", "1"); if ((FPU & NeonMode) && HasFullFP16) @@ -920,7 +922,8 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector &Features, HasSM4 = true; } if (Feature == "+strict-align") - HasUnaligned = false; + HasUnalignedAccess = false; + // All predecessor archs are added but select the latest one for ArchKind. if (Feature == "+v8a" && ArchInfo->Version < llvm::AArch64::ARMV8A.Version) ArchInfo = &llvm::AArch64::ARMV8A; diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index 542894c66412d..12fb50286f751 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -38,7 +38,6 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { bool HasSHA2 = false; bool HasSHA3 = false; bool HasSM4 = false; - bool HasUnaligned = true; bool HasFullFP16 = false; bool HasDotProd = false; bool HasFP16FML = false; diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index 30cb4b7864b06..83a1b3c66f380 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -509,7 +509,7 @@ bool ARMTargetInfo::handleTargetFeatures(std::vector &Features, SHA2 = 0; AES = 0; DSP = 0; - Unaligned = 1; + HasUnalignedAccess = true; SoftFloat = false; // Note that SoftFloatABI is initialized in our constructor. 
HWDiv = 0; @@ -576,7 +576,7 @@ bool ARMTargetInfo::handleTargetFeatures(std::vector &Features, return false; } } else if (Feature == "+strict-align") { - Unaligned = 0; + HasUnalignedAccess = false; } else if (Feature == "+fp16") { HW_FP |= HW_FP_HP; } else if (Feature == "+fullfp16") { @@ -783,7 +783,7 @@ void ARMTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__ARM_ARCH_PROFILE", "'" + CPUProfile + "'"); // ACLE 6.4.3 Unaligned access supported in hardware - if (Unaligned) + if (HasUnalignedAccess) Builder.defineMacro("__ARM_FEATURE_UNALIGNED", "1"); // ACLE 6.4.4 LDREX/STREX diff --git a/clang/lib/Basic/Targets/ARM.h b/clang/lib/Basic/Targets/ARM.h index 71322a094f5ed..e69adbe754739 100644 --- a/clang/lib/Basic/Targets/ARM.h +++ b/clang/lib/Basic/Targets/ARM.h @@ -88,8 +88,6 @@ class LLVM_LIBRARY_VISIBILITY ARMTargetInfo : public TargetInfo { LLVM_PREFERRED_TYPE(bool) unsigned DSP : 1; LLVM_PREFERRED_TYPE(bool) - unsigned Unaligned : 1; - LLVM_PREFERRED_TYPE(bool) unsigned DotProd : 1; LLVM_PREFERRED_TYPE(bool) unsigned HasMatMul : 1; diff --git a/clang/lib/Basic/Targets/LoongArch.cpp b/clang/lib/Basic/Targets/LoongArch.cpp index 88537989a0512..280bd1d8033cc 100644 --- a/clang/lib/Basic/Targets/LoongArch.cpp +++ b/clang/lib/Basic/Targets/LoongArch.cpp @@ -285,6 +285,8 @@ bool LoongArchTargetInfo::handleTargetFeatures( HasFeatureLSX = true; else if (Feature == "+lasx") HasFeatureLASX = true; + else if (Feature == "-ual") + HasUnalignedAccess = false; } return true; } diff --git a/clang/lib/Basic/Targets/LoongArch.h b/clang/lib/Basic/Targets/LoongArch.h index 3313102492cb8..68572843f2d74 100644 --- a/clang/lib/Basic/Targets/LoongArch.h +++ b/clang/lib/Basic/Targets/LoongArch.h @@ -132,6 +132,7 @@ class LLVM_LIBRARY_VISIBILITY LoongArch64TargetInfo : LoongArchTargetInfo(Triple, Opts) { LongWidth = LongAlign = PointerWidth = PointerAlign = 64; IntMaxType = Int64Type = SignedLong; + HasUnalignedAccess = true; resetDataLayout("e-m:e-p:64:64-i64:64-i128:128-n64-S128"); // TODO: select appropriate ABI. setABI("lp64d"); diff --git a/clang/lib/Basic/Targets/Mips.h b/clang/lib/Basic/Targets/Mips.h index 23d4e1b598fa1..c9dcf434c93b0 100644 --- a/clang/lib/Basic/Targets/Mips.h +++ b/clang/lib/Basic/Targets/Mips.h @@ -328,6 +328,8 @@ class LLVM_LIBRARY_VISIBILITY MipsTargetInfo : public TargetInfo { IsMips16 = true; else if (Feature == "+micromips") IsMicromips = true; + else if (Feature == "+mips32r6" || Feature == "+mips64r6") + HasUnalignedAccess = true; else if (Feature == "+dsp") DspRev = std::max(DspRev, DSP1); else if (Feature == "+dspr2") diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index 70683916a8b04..fa2f442e25846 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -92,6 +92,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { LongDoubleFormat = &llvm::APFloat::PPCDoubleDouble(); HasStrictFP = true; HasIbm128 = true; + HasUnalignedAccess = true; } // Set the language option for altivec based on our value. 
diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h index 3e08b27972fa3..8e302acd51b8a 100644 --- a/clang/lib/Basic/Targets/SystemZ.h +++ b/clang/lib/Basic/Targets/SystemZ.h @@ -47,6 +47,7 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo { LongDoubleFormat = &llvm::APFloat::IEEEquad(); DefaultAlignForAttributeAligned = 64; MinGlobalAlign = 16; + HasUnalignedAccess = true; if (Triple.isOSzOS()) { TLSSupported = false; // All vector types are default aligned on an 8-byte boundary, even if the diff --git a/clang/lib/Basic/Targets/VE.h b/clang/lib/Basic/Targets/VE.h index ea9a092cad809..7e8fdf6096ef2 100644 --- a/clang/lib/Basic/Targets/VE.h +++ b/clang/lib/Basic/Targets/VE.h @@ -40,6 +40,7 @@ class LLVM_LIBRARY_VISIBILITY VETargetInfo : public TargetInfo { Int64Type = SignedLong; RegParmMax = 8; MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64; + HasUnalignedAccess = true; WCharType = UnsignedInt; WIntType = UnsignedInt; diff --git a/clang/lib/Basic/Targets/WebAssembly.h b/clang/lib/Basic/Targets/WebAssembly.h index 83b1711f9fdf6..5568aa28eaefa 100644 --- a/clang/lib/Basic/Targets/WebAssembly.h +++ b/clang/lib/Basic/Targets/WebAssembly.h @@ -84,6 +84,7 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyTargetInfo : public TargetInfo { SizeType = UnsignedLong; PtrDiffType = SignedLong; IntPtrType = SignedLong; + HasUnalignedAccess = true; } StringRef getABI() const override; diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index d2232c7d5275a..c14e4d5f433d8 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -188,6 +188,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { LongDoubleFormat = &llvm::APFloat::x87DoubleExtended(); AddrSpaceMap = &X86AddrSpaceMap; HasStrictFP = true; + HasUnalignedAccess = true; bool IsWinCOFF = getTriple().isOSWindows() && getTriple().isOSBinFormatCOFF(); diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 187f1e8cdaf92..80c4667f43687 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -4393,8 +4393,7 @@ void CodeGenFunction::EmitNonNullArgCheck(RValue RV, QualType ArgType, NNAttr = getNonNullAttr(AC.getDecl(), PVD, ArgType, ArgNo); bool CanCheckNullability = false; - if (SanOpts.has(SanitizerKind::NullabilityArg) && !NNAttr && PVD && - !PVD->getType()->isRecordType()) { + if (SanOpts.has(SanitizerKind::NullabilityArg) && !NNAttr && PVD) { auto Nullability = PVD->getType()->getNullability(); CanCheckNullability = Nullability && *Nullability == NullabilityKind::NonNull && diff --git a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp index 7822903b89ce4..e32023aeac1e6 100644 --- a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp +++ b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp @@ -47,8 +47,10 @@ namespace { /// [i8 x 3] instead of i24. The function clipTailPadding does this. /// C++ examples that require clipping: /// struct { int a : 24; char b; }; // a must be clipped, b goes at offset 3 -/// struct A { int a : 24; }; // a must be clipped because a struct like B -// could exist: struct B : A { char b; }; // b goes at offset 3 +/// struct A { int a : 24; ~A(); }; // a must be clipped because: +/// struct B : A { char b; }; // b goes at offset 3 +/// * The allocation of bitfield access units is described in more detail in +/// CGRecordLowering::accumulateBitFields. /// * Clang ignores 0 sized bitfields and 0 sized bases but *not* zero sized /// fields. 
The existing asserts suggest that LLVM assumes that *every* field /// has an underlying storage type. Therefore empty structures containing @@ -184,8 +186,9 @@ struct CGRecordLowering { void lower(bool NonVirtualBaseType); void lowerUnion(bool isNoUniqueAddress); void accumulateFields(); - void accumulateBitFields(RecordDecl::field_iterator Field, - RecordDecl::field_iterator FieldEnd); + RecordDecl::field_iterator + accumulateBitFields(RecordDecl::field_iterator Field, + RecordDecl::field_iterator FieldEnd); void computeVolatileBitfields(); void accumulateBases(); void accumulateVPtrs(); @@ -378,13 +381,15 @@ void CGRecordLowering::lowerUnion(bool isNoUniqueAddress) { void CGRecordLowering::accumulateFields() { for (RecordDecl::field_iterator Field = D->field_begin(), FieldEnd = D->field_end(); - Field != FieldEnd;) { + Field != FieldEnd;) { if (Field->isBitField()) { - RecordDecl::field_iterator Start = Field; - // Iterate to gather the list of bitfields. - for (++Field; Field != FieldEnd && Field->isBitField(); ++Field); - accumulateBitFields(Start, Field); - } else if (!Field->isZeroSize(Context)) { + Field = accumulateBitFields(Field, FieldEnd); + assert((Field == FieldEnd || !Field->isBitField()) && + "Failed to accumulate all the bitfields"); + } else if (Field->isZeroSize(Context)) { + // Empty fields have no storage. + ++Field; + } else { // Use base subobject layout for the potentially-overlapping field, // as it is done in RecordLayoutBuilder Members.push_back(MemberInfo( @@ -394,33 +399,33 @@ void CGRecordLowering::accumulateFields() { : getStorageType(*Field), *Field)); ++Field; - } else { - ++Field; } } } -void +// Create members for bitfields. Field is a bitfield, and FieldEnd is the end +// iterator of the record. Return the first non-bitfield encountered. +RecordDecl::field_iterator CGRecordLowering::accumulateBitFields(RecordDecl::field_iterator Field, RecordDecl::field_iterator FieldEnd) { - // Run stores the first element of the current run of bitfields. FieldEnd is - // used as a special value to note that we don't have a current run. A - // bitfield run is a contiguous collection of bitfields that can be stored in - // the same storage block. Zero-sized bitfields and bitfields that would - // cross an alignment boundary break a run and start a new one. - RecordDecl::field_iterator Run = FieldEnd; - // Tail is the offset of the first bit off the end of the current run. It's - // used to determine if the ASTRecordLayout is treating these two bitfields as - // contiguous. StartBitOffset is offset of the beginning of the Run. - uint64_t StartBitOffset, Tail = 0; if (isDiscreteBitFieldABI()) { - for (; Field != FieldEnd; ++Field) { - uint64_t BitOffset = getFieldBitOffset(*Field); + // Run stores the first element of the current run of bitfields. FieldEnd is + // used as a special value to note that we don't have a current run. A + // bitfield run is a contiguous collection of bitfields that can be stored + // in the same storage block. Zero-sized bitfields and bitfields that would + // cross an alignment boundary break a run and start a new one. + RecordDecl::field_iterator Run = FieldEnd; + // Tail is the offset of the first bit off the end of the current run. It's + // used to determine if the ASTRecordLayout is treating these two bitfields + // as contiguous. StartBitOffset is offset of the beginning of the Run. + uint64_t StartBitOffset, Tail = 0; + for (; Field != FieldEnd && Field->isBitField(); ++Field) { // Zero-width bitfields end runs. 
if (Field->isZeroLengthBitField(Context)) { Run = FieldEnd; continue; } + uint64_t BitOffset = getFieldBitOffset(*Field); llvm::Type *Type = Types.ConvertTypeForMem(Field->getType(), /*ForBitField=*/true); // If we don't have a run yet, or don't live within the previous run's @@ -439,82 +444,248 @@ CGRecordLowering::accumulateBitFields(RecordDecl::field_iterator Field, Members.push_back(MemberInfo(bitsToCharUnits(StartBitOffset), MemberInfo::Field, nullptr, *Field)); } - return; + return Field; } - // Check if OffsetInRecord (the size in bits of the current run) is better - // as a single field run. When OffsetInRecord has legal integer width, and - // its bitfield offset is naturally aligned, it is better to make the - // bitfield a separate storage component so as it can be accessed directly - // with lower cost. - auto IsBetterAsSingleFieldRun = [&](uint64_t OffsetInRecord, - uint64_t StartBitOffset) { - if (!Types.getCodeGenOpts().FineGrainedBitfieldAccesses) - return false; - if (OffsetInRecord < 8 || !llvm::isPowerOf2_64(OffsetInRecord) || - !DataLayout.fitsInLegalInteger(OffsetInRecord)) - return false; - // Make sure StartBitOffset is naturally aligned if it is treated as an - // IType integer. - if (StartBitOffset % - Context.toBits(getAlignment(getIntNType(OffsetInRecord))) != - 0) - return false; - return true; - }; + // The SysV ABI can overlap bitfield storage units with both other bitfield + // storage units /and/ other non-bitfield data members. Accessing a sequence + // of bitfields mustn't interfere with adjacent non-bitfields -- they're + // permitted to be accessed in separate threads for instance. + + // We split runs of bit-fields into a sequence of "access units". When we emit + // a load or store of a bit-field, we'll load/store the entire containing + // access unit. As mentioned, the standard requires that these loads and + // stores must not interfere with accesses to other memory locations, and it + // defines the bit-field's memory location as the current run of + // non-zero-width bit-fields. So an access unit must never overlap with + // non-bit-field storage or cross a zero-width bit-field. Otherwise, we're + // free to draw the lines as we see fit. + + // Drawing these lines well can be complicated. LLVM generally can't modify a + // program to access memory that it didn't before, so using very narrow access + // units can prevent the compiler from using optimal access patterns. For + // example, suppose a run of bit-fields occupies four bytes in a struct. If we + // split that into four 1-byte access units, then a sequence of assignments + // that doesn't touch all four bytes may have to be emitted with multiple + // 8-bit stores instead of a single 32-bit store. On the other hand, if we use + // very wide access units, we may find ourselves emitting accesses to + // bit-fields we didn't really need to touch, just because LLVM was unable to + // clean up after us. + + // It is desirable to have access units be aligned powers of 2 no larger than + // a register. (On non-strict alignment ISAs, the alignment requirement can be + // dropped.) A three byte access unit will be accessed using 2-byte and 1-byte + // accesses and bit manipulation. If no bitfield straddles across the two + // separate accesses, it is better to have separate 2-byte and 1-byte access + // units, as then LLVM will not generate unnecessary memory accesses, or bit + // manipulation. 
Similarly, on a strict-alignment architecture, it is better + // to keep access-units naturally aligned, to avoid similar bit + // manipulation synthesizing larger unaligned accesses. + + // Bitfields that share parts of a single byte are, of necessity, placed in + // the same access unit. That unit will encompass a consecutive run where + // adjacent bitfields share parts of a byte. (The first bitfield of such an + // access unit will start at the beginning of a byte.) + + // We then try and accumulate adjacent access units when the combined unit is + // naturally sized, no larger than a register, and (on a strict alignment + // ISA), naturally aligned. Note that this requires lookahead to one or more + // subsequent access units. For instance, consider a 2-byte access-unit + // followed by 2 1-byte units. We can merge that into a 4-byte access-unit, + // but we would not want to merge a 2-byte followed by a single 1-byte (and no + // available tail padding). We keep track of the best access unit seen so far, + // and use that when we determine we cannot accumulate any more. Then we start + // again at the bitfield following that best one. + + // The accumulation is also prevented when: + // *) it would cross a character-aigned zero-width bitfield, or + // *) fine-grained bitfield access option is in effect. + + CharUnits RegSize = + bitsToCharUnits(Context.getTargetInfo().getRegisterWidth()); + unsigned CharBits = Context.getCharWidth(); + + // Data about the start of the span we're accumulating to create an access + // unit from. Begin is the first bitfield of the span. If Begin is FieldEnd, + // we've not got a current span. The span starts at the BeginOffset character + // boundary. BitSizeSinceBegin is the size (in bits) of the span -- this might + // include padding when we've advanced to a subsequent bitfield run. + RecordDecl::field_iterator Begin = FieldEnd; + CharUnits BeginOffset; + uint64_t BitSizeSinceBegin; + + // The (non-inclusive) end of the largest acceptable access unit we've found + // since Begin. If this is Begin, we're gathering the initial set of bitfields + // of a new span. BestEndOffset is the end of that acceptable access unit -- + // it might extend beyond the last character of the bitfield run, using + // available padding characters. + RecordDecl::field_iterator BestEnd = Begin; + CharUnits BestEndOffset; - // The start field is better as a single field run. - bool StartFieldAsSingleRun = false; for (;;) { - // Check to see if we need to start a new run. - if (Run == FieldEnd) { - // If we're out of fields, return. - if (Field == FieldEnd) + // AtAlignedBoundary is true iff Field is the (potential) start of a new + // span (or the end of the bitfields). When true, LimitOffset is the + // character offset of that span and Barrier indicates whether the new + // span cannot be merged into the current one. + bool AtAlignedBoundary = false; + bool Barrier = false; + + if (Field != FieldEnd && Field->isBitField()) { + uint64_t BitOffset = getFieldBitOffset(*Field); + if (Begin == FieldEnd) { + // Beginning a new span. + Begin = Field; + BestEnd = Begin; + + assert((BitOffset % CharBits) == 0 && "Not at start of char"); + BeginOffset = bitsToCharUnits(BitOffset); + BitSizeSinceBegin = 0; + } else if ((BitOffset % CharBits) != 0) { + // Bitfield occupies the same character as previous bitfield, it must be + // part of the same span. This can include zero-length bitfields, should + // the target not align them to character boundaries. 
Such non-alignment + // is at variance with the standards, which require zero-length + // bitfields be a barrier between access units. But of course we can't + // achieve that in the middle of a character. + assert(BitOffset == Context.toBits(BeginOffset) + BitSizeSinceBegin && + "Concatenating non-contiguous bitfields"); + } else { + // Bitfield potentially begins a new span. This includes zero-length + // bitfields on non-aligning targets that lie at character boundaries + // (those are barriers to merging). + if (Field->isZeroLengthBitField(Context)) + Barrier = true; + AtAlignedBoundary = true; + } + } else { + // We've reached the end of the bitfield run. Either we're done, or this + // is a barrier for the current span. + if (Begin == FieldEnd) break; - // Any non-zero-length bitfield can start a new run. - if (!Field->isZeroLengthBitField(Context)) { - Run = Field; - StartBitOffset = getFieldBitOffset(*Field); - Tail = StartBitOffset + Field->getBitWidthValue(Context); - StartFieldAsSingleRun = IsBetterAsSingleFieldRun(Tail - StartBitOffset, - StartBitOffset); + + Barrier = true; + AtAlignedBoundary = true; + } + + // InstallBest indicates whether we should create an access unit for the + // current best span: fields [Begin, BestEnd) occupying characters + // [BeginOffset, BestEndOffset). + bool InstallBest = false; + if (AtAlignedBoundary) { + // Field is the start of a new span or the end of the bitfields. The + // just-seen span now extends to BitSizeSinceBegin. + + // Determine if we can accumulate that just-seen span into the current + // accumulation. + CharUnits AccessSize = bitsToCharUnits(BitSizeSinceBegin + CharBits - 1); + if (BestEnd == Begin) { + // This is the initial run at the start of a new span. By definition, + // this is the best seen so far. + BestEnd = Field; + BestEndOffset = BeginOffset + AccessSize; + if (Types.getCodeGenOpts().FineGrainedBitfieldAccesses) + // Fine-grained access, so no merging of spans. + InstallBest = true; + else if (!BitSizeSinceBegin) + // A zero-sized initial span -- this will install nothing and reset + // for another. + InstallBest = true; + } else if (AccessSize > RegSize) + // Accumulating the just-seen span would create a multi-register access + // unit, which would increase register pressure. + InstallBest = true; + + if (!InstallBest) { + // Determine if accumulating the just-seen span will create an expensive + // access unit or not. + llvm::Type *Type = getIntNType(Context.toBits(AccessSize)); + if (!Context.getTargetInfo().hasCheapUnalignedBitFieldAccess()) { + // Unaligned accesses are expensive. Only accumulate if the new unit + // is naturally aligned. Otherwise install the best we have, which is + // either the initial access unit (can't do better), or a naturally + // aligned accumulation (since we would have already installed it if + // it wasn't naturally aligned). + CharUnits Align = getAlignment(Type); + if (Align > Layout.getAlignment()) + // The alignment required is greater than the containing structure + // itself. + InstallBest = true; + else if (!BeginOffset.isMultipleOf(Align)) + // The access unit is not at a naturally aligned offset within the + // structure. + InstallBest = true; + } + + if (!InstallBest) { + // Find the next used storage offset to determine what the limit of + // the current span is. That's either the offset of the next field + // with storage (which might be Field itself) or the end of the + // non-reusable tail padding. 
+ CharUnits LimitOffset; + for (auto Probe = Field; Probe != FieldEnd; ++Probe) + if (!Probe->isZeroSize(Context)) { + // A member with storage sets the limit. + assert((getFieldBitOffset(*Probe) % CharBits) == 0 && + "Next storage is not byte-aligned"); + LimitOffset = bitsToCharUnits(getFieldBitOffset(*Probe)); + goto FoundLimit; + } + // We reached the end of the fields. We can't necessarily use tail + // padding in C++ structs, so the NonVirtual size is what we must + // use there. + LimitOffset = RD ? Layout.getNonVirtualSize() : Layout.getDataSize(); + FoundLimit:; + + CharUnits TypeSize = getSize(Type); + if (BeginOffset + TypeSize <= LimitOffset) { + // There is space before LimitOffset to create a naturally-sized + // access unit. + BestEndOffset = BeginOffset + TypeSize; + BestEnd = Field; + } + + if (Barrier) + // The next field is a barrier that we cannot merge across. + InstallBest = true; + else + // Otherwise, we're not installing. Update the bit size + // of the current span to go all the way to LimitOffset, which is + // the (aligned) offset of next bitfield to consider. + BitSizeSinceBegin = Context.toBits(LimitOffset - BeginOffset); + } } - ++Field; - continue; } - // If the start field of a new run is better as a single run, or - // if current field (or consecutive fields) is better as a single run, or - // if current field has zero width bitfield and either - // UseZeroLengthBitfieldAlignment or UseBitFieldTypeAlignment is set to - // true, or - // if the offset of current field is inconsistent with the offset of - // previous field plus its offset, - // skip the block below and go ahead to emit the storage. - // Otherwise, try to add bitfields to the run. - if (!StartFieldAsSingleRun && Field != FieldEnd && - !IsBetterAsSingleFieldRun(Tail - StartBitOffset, StartBitOffset) && - (!Field->isZeroLengthBitField(Context) || - (!Context.getTargetInfo().useZeroLengthBitfieldAlignment() && - !Context.getTargetInfo().useBitFieldTypeAlignment())) && - Tail == getFieldBitOffset(*Field)) { - Tail += Field->getBitWidthValue(Context); + if (InstallBest) { + assert((Field == FieldEnd || !Field->isBitField() || + (getFieldBitOffset(*Field) % CharBits) == 0) && + "Installing but not at an aligned bitfield or limit"); + CharUnits AccessSize = BestEndOffset - BeginOffset; + if (!AccessSize.isZero()) { + // Add the storage member for the access unit to the record. The + // bitfields get the offset of their storage but come afterward and + // remain there after a stable sort. + llvm::Type *Type = getIntNType(Context.toBits(AccessSize)); + Members.push_back(StorageInfo(BeginOffset, Type)); + for (; Begin != BestEnd; ++Begin) + if (!Begin->isZeroLengthBitField(Context)) + Members.push_back( + MemberInfo(BeginOffset, MemberInfo::Field, nullptr, *Begin)); + } + // Reset to start a new span. + Field = BestEnd; + Begin = FieldEnd; + } else { + assert(Field != FieldEnd && Field->isBitField() && + "Accumulating past end of bitfields"); + assert(!Barrier && "Accumulating across barrier"); + // Accumulate this bitfield into the current (potential) span. + BitSizeSinceBegin += Field->getBitWidthValue(Context); ++Field; - continue; } - - // We've hit a break-point in the run and need to emit a storage field. - llvm::Type *Type = getIntNType(Tail - StartBitOffset); - // Add the storage member to the record and set the bitfield info for all of - // the bitfields in the run. Bitfields get the offset of their storage but - // come afterward and remain there after a stable sort. 
-    Members.push_back(StorageInfo(bitsToCharUnits(StartBitOffset), Type));
-    for (; Run != Field; ++Run)
-      Members.push_back(MemberInfo(bitsToCharUnits(StartBitOffset),
-                                   MemberInfo::Field, nullptr, *Run));
-    Run = FieldEnd;
-    StartFieldAsSingleRun = false;
   }
+
+  return Field;
 }
 
 void CGRecordLowering::accumulateBases() {
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index fa3f297245897..44103884940fd 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -989,8 +989,7 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
   // return value. Initialize the flag to 'true' and refine it in EmitParmDecl.
   if (SanOpts.has(SanitizerKind::NullabilityReturn)) {
     auto Nullability = FnRetTy->getNullability();
-    if (Nullability && *Nullability == NullabilityKind::NonNull &&
-        !FnRetTy->isRecordType()) {
+    if (Nullability && *Nullability == NullabilityKind::NonNull) {
       if (!(SanOpts.has(SanitizerKind::ReturnsNonnullAttribute) &&
             CurCodeDecl && CurCodeDecl->getAttr<ReturnsNonNullAttr>()))
         RetValNullabilityPrecondition =
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index 79ebb0ae0ee98..6e3baf8386441 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -1206,16 +1206,6 @@ compileModuleImpl(CompilerInstance &ImportingInstance, SourceLocation ImportLoc,
   // Note the name of the module we're building.
   Invocation->getLangOpts().CurrentModule = std::string(ModuleName);
 
-  // Make sure that the failed-module structure has been allocated in
-  // the importing instance, and propagate the pointer to the newly-created
-  // instance.
-  PreprocessorOptions &ImportingPPOpts
-    = ImportingInstance.getInvocation().getPreprocessorOpts();
-  if (!ImportingPPOpts.FailedModules)
-    ImportingPPOpts.FailedModules =
-        std::make_shared<PreprocessorOptions::FailedModulesSet>();
-  PPOpts.FailedModules = ImportingPPOpts.FailedModules;
-
   // If there is a module map file, build the module using the module map.
   // Set up the inputs/outputs so that we build the module from its umbrella
   // header.
@@ -1269,6 +1259,13 @@ compileModuleImpl(CompilerInstance &ImportingInstance, SourceLocation ImportLoc,
   SourceMgr.pushModuleBuildStack(ModuleName,
     FullSourceLoc(ImportLoc, ImportingInstance.getSourceManager()));
 
+  // Make sure that the failed-module structure has been allocated in
+  // the importing instance, and propagate the pointer to the newly-created
+  // instance.
+  if (!ImportingInstance.hasFailedModulesSet())
+    ImportingInstance.createFailedModulesSet();
+  Instance.setFailedModulesSet(ImportingInstance.getFailedModulesSetPtr());
+
   // If we're collecting module dependencies, we need to share a collector
   // between all of the module CompilerInstances. Other than that, we don't
   // want to produce any dependency output from the module build.
@@ -1992,10 +1989,8 @@ ModuleLoadResult CompilerInstance::findOrCompileModuleAndReadAST(
     return nullptr;
   }
 
-  // Check whether we have already attempted to build this module (but
-  // failed).
-  if (getPreprocessorOpts().FailedModules &&
-      getPreprocessorOpts().FailedModules->hasAlreadyFailed(ModuleName)) {
+  // Check whether we have already attempted to build this module (but failed).
+ if (FailedModules && FailedModules->hasAlreadyFailed(ModuleName)) { getDiagnostics().Report(ModuleNameLoc, diag::err_module_not_built) << ModuleName << SourceRange(ImportLoc, ModuleNameLoc); return nullptr; @@ -2006,8 +2001,8 @@ ModuleLoadResult CompilerInstance::findOrCompileModuleAndReadAST( ModuleFilename)) { assert(getDiagnostics().hasErrorOccurred() && "undiagnosed error in compileModuleAndReadAST"); - if (getPreprocessorOpts().FailedModules) - getPreprocessorOpts().FailedModules->addFailed(ModuleName); + if (FailedModules) + FailedModules->addFailed(ModuleName); return nullptr; } diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp index 3fd1cdd3b4794..0bc26b694cfc8 100644 --- a/clang/lib/Frontend/FrontendActions.cpp +++ b/clang/lib/Frontend/FrontendActions.cpp @@ -69,7 +69,10 @@ void InitOnlyAction::ExecuteAction() { // Basically PreprocessOnlyAction::ExecuteAction. void ReadPCHAndPreprocessAction::ExecuteAction() { - Preprocessor &PP = getCompilerInstance().getPreprocessor(); + CompilerInstance &CI = getCompilerInstance(); + AdjustCI(CI); + + Preprocessor &PP = CI.getPreprocessor(); // Ignore unknown pragmas. PP.IgnorePragmas(); @@ -1188,6 +1191,8 @@ void PrintDependencyDirectivesSourceMinimizerAction::ExecuteAction() { void GetDependenciesByModuleNameAction::ExecuteAction() { CompilerInstance &CI = getCompilerInstance(); + AdjustCI(CI); + Preprocessor &PP = CI.getPreprocessor(); SourceManager &SM = PP.getSourceManager(); FileID MainFileID = SM.getMainFileID(); diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index a34e72402c0e6..9fb6204f90c9a 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -1248,25 +1248,25 @@ float4 rsqrt(float4); /// rounded to the nearest even value. 
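
(Aside: the edits below swap the round intrinsic for roundeven, matching the "nearest even value" wording in the documentation above. For reference, the two builtins differ only on halfway cases; these values reflect standard IEEE-754 semantics and are not taken from the patch:

    // __builtin_elementwise_roundeven vs. __builtin_elementwise_round:
    //   roundeven(0.5)  == 0.0     round(0.5)  == 1.0
    //   roundeven(1.5)  == 2.0     round(1.5)  == 2.0
    //   roundeven(2.5)  == 2.0     round(2.5)  == 3.0
    //   roundeven(-0.5) == -0.0    round(-0.5) == -1.0
)
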
 _HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_elementwise_round)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_roundeven)
 half round(half);
 _HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_elementwise_round)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_roundeven)
 half2 round(half2);
 _HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_elementwise_round)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_roundeven)
 half3 round(half3);
 _HLSL_16BIT_AVAILABILITY(shadermodel, 6.2)
-_HLSL_BUILTIN_ALIAS(__builtin_elementwise_round)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_roundeven)
 half4 round(half4);
 
-_HLSL_BUILTIN_ALIAS(__builtin_elementwise_round)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_roundeven)
 float round(float);
-_HLSL_BUILTIN_ALIAS(__builtin_elementwise_round)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_roundeven)
 float2 round(float2);
-_HLSL_BUILTIN_ALIAS(__builtin_elementwise_round)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_roundeven)
 float3 round(float3);
-_HLSL_BUILTIN_ALIAS(__builtin_elementwise_round)
+_HLSL_BUILTIN_ALIAS(__builtin_elementwise_roundeven)
 float4 round(float4);
 
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/InstallAPI/CMakeLists.txt b/clang/lib/InstallAPI/CMakeLists.txt
index 894db699578f2..e0bc8d969ecb3 100644
--- a/clang/lib/InstallAPI/CMakeLists.txt
+++ b/clang/lib/InstallAPI/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(LLVM_LINK_COMPONENTS
   Support
   TextAPI
+  TextAPIBinaryReader
   Demangle
   Core
   )
diff --git a/clang/lib/InstallAPI/DylibVerifier.cpp b/clang/lib/InstallAPI/DylibVerifier.cpp
index ba25e4183a9b8..c0eda1d81b9b9 100644
--- a/clang/lib/InstallAPI/DylibVerifier.cpp
+++ b/clang/lib/InstallAPI/DylibVerifier.cpp
@@ -10,6 +10,7 @@
 #include "clang/InstallAPI/FrontendRecords.h"
 #include "clang/InstallAPI/InstallAPIDiagnostic.h"
 #include "llvm/Demangle/Demangle.h"
+#include "llvm/TextAPI/DylibReader.h"
 
 using namespace llvm::MachO;
 
@@ -35,6 +36,14 @@ struct DylibVerifier::SymbolContext {
   bool Inlined = false;
 };
 
+struct DylibVerifier::DWARFContext {
+  // Track whether DSYM parsing has already been attempted to avoid re-parsing.
+  bool ParsedDSYM{false};
+
+  // Lookup table for source locations by symbol name.
+  DylibReader::SymbolToSourceLocMap SourceLocs{};
+};
+
 static bool isCppMangled(StringRef Name) {
   // InstallAPI currently only supports itanium manglings.
   return (Name.starts_with("_Z") || Name.starts_with("__Z") ||
@@ -511,14 +520,16 @@ DylibVerifier::Result DylibVerifier::verify(GlobalRecord *R,
   return verifyImpl(R, SymCtx);
 }
 
-void DylibVerifier::VerifierContext::emitDiag(
-    llvm::function_ref<void()> Report) {
+void DylibVerifier::VerifierContext::emitDiag(llvm::function_ref<void()> Report,
+                                              RecordLoc *Loc) {
   if (!DiscoveredFirstError) {
     Diag->Report(diag::warn_target)
         << (PrintArch ? getArchitectureName(Target.Arch)
                       : getTargetTripleName(Target));
     DiscoveredFirstError = true;
   }
+  if (Loc && Loc->isValid())
+    llvm::errs() << Loc->File << ":" << Loc->Line << ":" << 0 << ": ";
   Report();
 }
 
@@ -561,26 +572,36 @@ void DylibVerifier::visitSymbolInDylib(const Record &R, SymbolContext &SymCtx) {
     return;
   }
 
-  // All checks at this point classify as some kind of violation that should be
-  // reported.
+  const bool IsLinkerSymbol = SymbolName.starts_with("$ld$");
+
+  // All checks at this point classify as some kind of violation.
+  // The different verification modes dictate whether they are reported to the
+  // user.
+ if (IsLinkerSymbol || (Mode > VerificationMode::ErrorsOnly)) + accumulateSrcLocForDylibSymbols(); + RecordLoc Loc = DWARFCtx->SourceLocs.lookup(SymCtx.SymbolName); // Regardless of verification mode, error out on mismatched special linker // symbols. - if (SymbolName.starts_with("$ld$")) { - Ctx.emitDiag([&]() { - Ctx.Diag->Report(diag::err_header_symbol_missing) - << getAnnotatedName(&R, SymCtx, /*ValidSourceLoc=*/false); - }); + if (IsLinkerSymbol) { + Ctx.emitDiag( + [&]() { + Ctx.Diag->Report(diag::err_header_symbol_missing) + << getAnnotatedName(&R, SymCtx, Loc.isValid()); + }, + &Loc); updateState(Result::Invalid); return; } // Missing declarations for exported symbols are hard errors on Pedantic mode. if (Mode == VerificationMode::Pedantic) { - Ctx.emitDiag([&]() { - Ctx.Diag->Report(diag::err_header_symbol_missing) - << getAnnotatedName(&R, SymCtx, /*ValidSourceLoc=*/false); - }); + Ctx.emitDiag( + [&]() { + Ctx.Diag->Report(diag::err_header_symbol_missing) + << getAnnotatedName(&R, SymCtx, Loc.isValid()); + }, + &Loc); updateState(Result::Invalid); return; } @@ -588,10 +609,12 @@ void DylibVerifier::visitSymbolInDylib(const Record &R, SymbolContext &SymCtx) { // Missing declarations for exported symbols are warnings on ErrorsAndWarnings // mode. if (Mode == VerificationMode::ErrorsAndWarnings) { - Ctx.emitDiag([&]() { - Ctx.Diag->Report(diag::warn_header_symbol_missing) - << getAnnotatedName(&R, SymCtx, /*ValidSourceLoc=*/false); - }); + Ctx.emitDiag( + [&]() { + Ctx.Diag->Report(diag::warn_header_symbol_missing) + << getAnnotatedName(&R, SymCtx, Loc.isValid()); + }, + &Loc); updateState(Result::Ignore); return; } @@ -622,6 +645,18 @@ void DylibVerifier::visitObjCIVar(const ObjCIVarRecord &R, visitSymbolInDylib(R, SymCtx); } +void DylibVerifier::accumulateSrcLocForDylibSymbols() { + if (DSYMPath.empty()) + return; + + assert(DWARFCtx != nullptr && "Expected an initialized DWARFContext"); + if (DWARFCtx->ParsedDSYM) + return; + DWARFCtx->ParsedDSYM = true; + DWARFCtx->SourceLocs = + DylibReader::accumulateSourceLocFromDSYM(DSYMPath, Ctx.Target); +} + void DylibVerifier::visitObjCInterface(const ObjCInterfaceRecord &R) { if (R.isVerified()) return; @@ -655,6 +690,8 @@ DylibVerifier::Result DylibVerifier::verifyRemainingSymbols() { return Result::NoVerify; assert(!Dylib.empty() && "No binary to verify against"); + DWARFContext DWARFInfo; + DWARFCtx = &DWARFInfo; Ctx.DiscoveredFirstError = false; Ctx.PrintArch = true; for (std::shared_ptr Slice : Dylib) { diff --git a/clang/lib/Interpreter/Value.cpp b/clang/lib/Interpreter/Value.cpp index 1d6b2da087e9f..eb2ce9c9fd330 100644 --- a/clang/lib/Interpreter/Value.cpp +++ b/clang/lib/Interpreter/Value.cpp @@ -1,4 +1,4 @@ -//===--- Interpreter.h - Incremental Compiation and Execution---*- C++ -*-===// +//===------------ Value.cpp - Definition of interpreter value -------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -22,8 +22,6 @@ #include #include -using namespace clang; - namespace { // This is internal buffer maintained by Value, used to hold temporaries. @@ -61,7 +59,7 @@ class ValueStorage { void Release() { assert(RefCnt > 0 && "Can't release if reference count is already zero"); if (--RefCnt == 0) { - // We hace a non-trivial dtor. + // We have a non-trivial dtor. 
       if (Dtor && IsAlive()) {
         assert(Elements && "We at least should have 1 element in Value");
         size_t Stride = AllocSize / Elements;
@@ -97,6 +95,8 @@ class ValueStorage {
 };
 } // namespace
 
+namespace clang {
+
 static Value::Kind ConvertQualTypeToKind(const ASTContext &Ctx, QualType QT) {
   if (Ctx.hasSameType(QT, Ctx.VoidTy))
     return Value::K_Void;
@@ -265,3 +265,5 @@ void Value::print(llvm::raw_ostream &Out) const {
   assert(OpaqueType != nullptr && "Can't print default Value");
   Out << "Not implement yet.\n";
 }
+
+} // namespace clang
diff --git a/clang/lib/Lex/PPLexerChange.cpp b/clang/lib/Lex/PPLexerChange.cpp
index 3b1b6df1dbae4..a0cc2b516574c 100644
--- a/clang/lib/Lex/PPLexerChange.cpp
+++ b/clang/lib/Lex/PPLexerChange.cpp
@@ -93,16 +93,10 @@ bool Preprocessor::EnterSourceFile(FileID FID, ConstSearchDirIterator CurDir,
   }
 
   Lexer *TheLexer = new Lexer(FID, *InputFile, *this, IsFirstIncludeOfFile);
-  if (getPreprocessorOpts().DependencyDirectivesForFile &&
-      FID != PredefinesFileID) {
-    if (OptionalFileEntryRef File = SourceMgr.getFileEntryRefForID(FID)) {
-      if (std::optional<ArrayRef<dependency_directives_scan::Directive>>
-              DepDirectives =
-                  getPreprocessorOpts().DependencyDirectivesForFile(*File)) {
+  if (DependencyDirectivesForFile && FID != PredefinesFileID)
+    if (OptionalFileEntryRef File = SourceMgr.getFileEntryRefForID(FID))
+      if (auto DepDirectives = DependencyDirectivesForFile(*File))
         TheLexer->DepDirectives = *DepDirectives;
-      }
-    }
-  }
 
   EnterSourceFileWithLexer(TheLexer, CurDir);
   return false;
diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
index 861a25dc5103c..63fe678cbb29e 100644
--- a/clang/lib/Parse/ParseDeclCXX.cpp
+++ b/clang/lib/Parse/ParseDeclCXX.cpp
@@ -1502,15 +1502,6 @@ void Parser::ParseMicrosoftInheritanceClassAttributes(ParsedAttributes &attrs) {
   }
 }
 
-void Parser::ParseNullabilityClassAttributes(ParsedAttributes &attrs) {
-  while (Tok.is(tok::kw__Nullable)) {
-    IdentifierInfo *AttrName = Tok.getIdentifierInfo();
-    auto Kind = Tok.getKind();
-    SourceLocation AttrNameLoc = ConsumeToken();
-    attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0, Kind);
-  }
-}
-
 /// Determine whether the following tokens are valid after a type-specifier
 /// which could be a standalone declaration. This will conservatively return
 /// true if there's any doubt, and is appropriate for insert-';' fixits.
@@ -1692,21 +1683,15 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
   ParsedAttributes attrs(AttrFactory);
   // If attributes exist after tag, parse them.
-  for (;;) {
-    MaybeParseAttributes(PAKM_CXX11 | PAKM_Declspec | PAKM_GNU, attrs);
-    // Parse inheritance specifiers.
-    if (Tok.isOneOf(tok::kw___single_inheritance,
-                    tok::kw___multiple_inheritance,
-                    tok::kw___virtual_inheritance)) {
-      ParseMicrosoftInheritanceClassAttributes(attrs);
-      continue;
-    }
-    if (Tok.is(tok::kw__Nullable)) {
-      ParseNullabilityClassAttributes(attrs);
-      continue;
-    }
-    break;
-  }
+  MaybeParseAttributes(PAKM_CXX11 | PAKM_Declspec | PAKM_GNU, attrs);
+
+  // Parse inheritance specifiers.
+  if (Tok.isOneOf(tok::kw___single_inheritance, tok::kw___multiple_inheritance,
+                  tok::kw___virtual_inheritance))
+    ParseMicrosoftInheritanceClassAttributes(attrs);
+
+  // Allow attributes to precede or succeed the inheritance specifiers.
+  MaybeParseAttributes(PAKM_CXX11 | PAKM_Declspec | PAKM_GNU, attrs);
 
   // Source location used by FIXIT to insert misplaced
   // C++11 attributes
diff --git a/clang/lib/Sema/SemaAttr.cpp b/clang/lib/Sema/SemaAttr.cpp
index a5dd158808f26..0dcf42e489971 100644
--- a/clang/lib/Sema/SemaAttr.cpp
+++ b/clang/lib/Sema/SemaAttr.cpp
@@ -215,18 +215,6 @@ void Sema::inferGslOwnerPointerAttribute(CXXRecordDecl *Record) {
   inferGslPointerAttribute(Record, Record);
 }
 
-void Sema::inferNullableClassAttribute(CXXRecordDecl *CRD) {
-  static llvm::StringSet<> Nullable{
-      "auto_ptr",         "shared_ptr", "unique_ptr",         "exception_ptr",
-      "coroutine_handle", "function",   "move_only_function",
-  };
-
-  if (CRD->isInStdNamespace() && Nullable.count(CRD->getName()) &&
-      !CRD->hasAttr<TypeNullableAttr>())
-    for (Decl *Redecl : CRD->redecls())
-      Redecl->addAttr(TypeNullableAttr::CreateImplicit(Context));
-}
-
 void Sema::ActOnPragmaOptionsAlign(PragmaOptionsAlignKind Kind,
                                    SourceLocation PragmaLoc) {
   PragmaMsStackAction Action = Sema::PSK_Reset;
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index f4caa487dea1f..550edeb5709a9 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -27,7 +27,6 @@
 #include "clang/AST/ExprObjC.h"
 #include "clang/AST/ExprOpenMP.h"
 #include "clang/AST/FormatString.h"
-#include "clang/AST/IgnoreExpr.h"
 #include "clang/AST/NSAPI.h"
 #include "clang/AST/NonTrivialTypeVisitor.h"
 #include "clang/AST/OperationKinds.h"
@@ -5651,6 +5650,7 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
   case Builtin::BI__builtin_elementwise_log2:
   case Builtin::BI__builtin_elementwise_log10:
   case Builtin::BI__builtin_elementwise_pow:
+  case Builtin::BI__builtin_elementwise_roundeven:
   case Builtin::BI__builtin_elementwise_sin:
   case Builtin::BI__builtin_elementwise_sqrt:
   case Builtin::BI__builtin_elementwise_trunc: {
@@ -7609,14 +7609,6 @@ bool Sema::getFormatStringInfo(const FormatAttr *Format, bool IsCXXMember,
 ///
 /// Returns true if the value evaluates to null.
 static bool CheckNonNullExpr(Sema &S, const Expr *Expr) {
-  // Treat (smart) pointers constructed from nullptr as null, whether we can
-  // const-evaluate them or not.
-  // This must happen first: the smart pointer expr might have _Nonnull type!
-  if (isa<CXXNullPtrLiteralExpr>(
-          IgnoreExprNodes(Expr, IgnoreImplicitAsWrittenSingleStep,
-                          IgnoreElidableImplicitConstructorSingleStep)))
-    return true;
-
   // If the expression has non-null type, it doesn't evaluate to null.
   if (auto nullability = Expr->IgnoreImplicit()->getType()->getNullability()) {
     if (*nullability == NullabilityKind::NonNull)
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 19a52a2d70379..5027deda0d7e0 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -9915,7 +9915,7 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
   // FIXME: We need a better way to separate C++ standard and clang modules.
   bool ImplicitInlineCXX20 = !getLangOpts().CPlusPlusModules ||
                              !NewFD->getOwningModule() ||
-                             NewFD->getOwningModule()->isGlobalModule() ||
+                             NewFD->isFromExplicitGlobalModule() ||
                              NewFD->getOwningModule()->isHeaderLikeModule();
   bool isInline = D.getDeclSpec().isInlineSpecified();
   bool isVirtual = D.getDeclSpec().isVirtualSpecified();
@@ -18170,7 +18170,9 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
                           cast_or_null<RecordDecl>(PrevDecl));
   }
 
-  if (OOK != OOK_Outside && TUK == TUK_Definition && !getLangOpts().CPlusPlus)
+  // Only C23 and later allow defining new types in 'offsetof()'.
+  if (OOK != OOK_Outside && TUK == TUK_Definition && !getLangOpts().CPlusPlus &&
+      !getLangOpts().C23)
     Diag(New->getLocation(), diag::ext_type_defined_in_offsetof)
         << (OOK == OOK_Macro) << New->getSourceRange();
 
@@ -18317,10 +18319,8 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
   if (PrevDecl)
     mergeDeclAttributes(New, PrevDecl);
 
-  if (auto *CXXRD = dyn_cast<CXXRecordDecl>(New)) {
+  if (auto *CXXRD = dyn_cast<CXXRecordDecl>(New))
     inferGslOwnerPointerAttribute(CXXRD);
-    inferNullableClassAttribute(CXXRD);
-  }
 
   // If there's a #pragma GCC visibility in scope, set the visibility of this
   // record.
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 8bce04640e748..f25f3afd0f4af 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5982,20 +5982,6 @@ static void handleBuiltinAliasAttr(Sema &S, Decl *D,
   D->addAttr(::new (S.Context) BuiltinAliasAttr(S.Context, AL, Ident));
 }
 
-static void handleNullableTypeAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
-  if (AL.isUsedAsTypeAttr())
-    return;
-
-  if (auto *CRD = dyn_cast<CXXRecordDecl>(D);
-      !CRD || !(CRD->isClass() || CRD->isStruct())) {
-    S.Diag(AL.getRange().getBegin(), diag::err_attribute_wrong_decl_type_str)
-        << AL << AL.isRegularKeywordAttribute() << "classes";
-    return;
-  }
-
-  handleSimpleAttribute<TypeNullableAttr>(S, D, AL);
-}
-
 static void handlePreferredTypeAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   if (!AL.hasParsedType()) {
     S.Diag(AL.getLoc(), diag::err_attribute_wrong_number_arguments) << AL << 1;
@@ -9947,10 +9933,6 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_UsingIfExists:
     handleSimpleAttribute<UsingIfExistsAttr>(S, D, AL);
     break;
-
-  case ParsedAttr::AT_TypeNullable:
-    handleNullableTypeAttr(S, D, AL);
-    break;
   }
 }
 
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 3382d56303d62..dce225a7204da 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -7079,11 +7079,6 @@ PerformConstructorInitialization(Sema &S,
       hasCopyOrMoveCtorParam(S.Context,
                              getConstructorInfo(Step.Function.FoundDecl));
 
-  // A smart pointer constructed from a nullable pointer is nullable.
-  if (NumArgs == 1 && !Kind.isExplicitCast())
-    S.diagnoseNullableToNonnullConversion(
-        Entity.getType(), Args.front()->getType(), Kind.getLocation());
-
   // Determine the arguments required to actually perform the constructor
   // call.
   if (S.CompleteConstructorCall(Constructor, Step.Type, Args, Loc,
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index de0c2e7399632..51450e486eaeb 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -14811,13 +14811,6 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc,
       }
     }
 
-    // Check for nonnull = nullable.
-    // This won't be caught in the arg's initialization: the parameter to
-    // the assignment operator is not marked nonnull.
-    if (Op == OO_Equal)
-      diagnoseNullableToNonnullConversion(Args[0]->getType(),
-                                          Args[1]->getType(), OpLoc);
-
     // Convert the arguments.
     if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(FnDecl)) {
       // Best->Access is only meaningful for class members.
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index de728305d55aa..9cd19d711af4d 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -1836,7 +1836,27 @@ static TemplateParameterList *GetTemplateParameterList(TemplateDecl *TD) {
   // Make sure we get the template parameter list from the most
   // recent declaration, since that is the only one that is guaranteed to
   // have all the default template argument information.
-  return cast<TemplateDecl>(TD->getMostRecentDecl())->getTemplateParameters();
+  Decl *D = TD->getMostRecentDecl();
+  // C++11 N3337 [temp.param]p12:
+  // A default template argument shall not be specified in a friend class
+  // template declaration.
+  //
+  // Skip past friend *declarations* because they are not supposed to contain
+  // default template arguments. Moreover, these declarations may introduce
+  // template parameters living in different template depths than the
+  // corresponding template parameters in TD, causing unmatched constraint
+  // substitution.
+  //
+  // FIXME: Diagnose such cases within a class template:
+  // template
+  // struct S {
+  //   template friend struct C;
+  // };
+  // template struct S;
+  while (D->getFriendObjectKind() != Decl::FriendObjectKind::FOK_None &&
+         D->getPreviousDecl())
+    D = D->getPreviousDecl();
+  return cast<TemplateDecl>(D)->getTemplateParameters();
 }
 
 DeclResult Sema::CheckClassTemplate(
@@ -2171,7 +2191,6 @@ DeclResult Sema::CheckClassTemplate(
   AddPushedVisibilityAttribute(NewClass);
 
   inferGslOwnerPointerAttribute(NewClass);
-  inferNullableClassAttribute(NewClass);
 
   if (TUK != TUK_Friend) {
     // Per C++ [basic.scope.temp]p2, skip the template parameter scopes.
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index ab954f9eb6958..9006f8333c76a 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -4715,18 +4715,6 @@ static bool DiagnoseMultipleAddrSpaceAttributes(Sema &S, LangAS ASOld,
   return false;
 }
 
-// Whether this is a type broadly expected to have nullability attached.
-// These types are affected by `#pragma assume_nonnull`, and missing nullability
-// will be diagnosed with -Wnullability-completeness.
-static bool shouldHaveNullability(QualType T) {
-  return T->canHaveNullability(/*ResultIfUnknown=*/false) &&
-         // For now, do not infer/require nullability on C++ smart pointers.
-         // It's unclear whether the pragma's behavior is useful for C++.
-         // e.g. treating type-aliases and template-type-parameters differently
-         // from types of declarations can be surprising.
-         !isa<RecordType>(T->getCanonicalTypeInternal());
-}
-
 static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state,
                                                 QualType declSpecType,
                                                 TypeSourceInfo *TInfo) {
@@ -4845,7 +4833,8 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state,
       // inner pointers.
       complainAboutMissingNullability = CAMN_InnerPointers;
 
-      if (shouldHaveNullability(T) && !T->getNullability()) {
+      if (T->canHaveNullability(/*ResultIfUnknown*/ false) &&
+          !T->getNullability()) {
         // Note that we allow but don't require nullability on dependent types.
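
(Aside: canHaveNullability() is the gate for both #pragma clang assume_nonnull inference and the -Wnullability-completeness diagnostic, so restoring the direct calls here and in the hunk that follows sends record types back through plain canHaveNullability(), which already excludes them unless something marks them nullable. A minimal illustration of the pragma's effect on ordinary pointers, which this patch does not change:

    #pragma clang assume_nonnull begin
    int *global_ptr;  // treated as int *_Nonnull under the pragma
    int *f(int *p);   // return type and parameter both inferred _Nonnull
    #pragma clang assume_nonnull end
)
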
         ++NumPointersRemaining;
       }
@@ -5068,7 +5057,8 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state,
   // If the type itself could have nullability but does not, infer pointer
   // nullability and perform consistency checking.
   if (S.CodeSynthesisContexts.empty()) {
-    if (shouldHaveNullability(T) && !T->getNullability()) {
+    if (T->canHaveNullability(/*ResultIfUnknown*/ false) &&
+        !T->getNullability()) {
       if (isVaList(T)) {
         // Record that we've seen a pointer, but do nothing else.
         if (NumPointersRemaining > 0)
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 2cc7f21bf60c4..1e5734c9c834e 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -3195,6 +3195,10 @@ uint64_t ASTWriter::WriteDeclContextLexicalBlock(ASTContext &Context,
   if (DC->decls_empty())
     return 0;
 
+  // In reduced BMI, we don't care about the declarations in functions.
+  if (GeneratingReducedBMI && DC->isFunctionOrMethod())
+    return 0;
+
   uint64_t Offset = Stream.GetCurrentBitNo();
   SmallVector KindDeclPairs;
   for (const auto *D : DC->decls()) {
diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
index 33b43417a6613..492b8f1e2b386 100644
--- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
+++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
@@ -363,20 +363,22 @@ class DependencyScanningAction : public tooling::ToolAction {
             PrebuiltModuleVFSMap, ScanInstance.getDiagnostics()))
       return false;
 
-    // Use the dependency scanning optimized file system if requested to do so.
-    if (DepFS) {
-      llvm::IntrusiveRefCntPtr<DependencyScanningWorkerFilesystem> LocalDepFS =
-          DepFS;
-      ScanInstance.getPreprocessorOpts().DependencyDirectivesForFile =
-          [LocalDepFS = std::move(LocalDepFS)](FileEntryRef File)
-          -> std::optional<ArrayRef<dependency_directives_scan::Directive>> {
-        if (llvm::ErrorOr<EntryRef> Entry =
-                LocalDepFS->getOrCreateFileSystemEntry(File.getName()))
-          if (LocalDepFS->ensureDirectiveTokensArePopulated(*Entry))
-            return Entry->getDirectiveTokens();
-        return std::nullopt;
-      };
-    }
+    auto AdjustCI = [&](CompilerInstance &CI) {
+      // Set up the dependency scanning file system callback if requested.
+      if (DepFS) {
+        auto GetDependencyDirectives = [LocalDepFS = DepFS](FileEntryRef File)
+            -> std::optional<ArrayRef<dependency_directives_scan::Directive>> {
+          if (llvm::ErrorOr<EntryRef> Entry =
+                  LocalDepFS->getOrCreateFileSystemEntry(File.getName()))
+            if (LocalDepFS->ensureDirectiveTokensArePopulated(*Entry))
+              return Entry->getDirectiveTokens();
+          return std::nullopt;
+        };
+
+        CI.getPreprocessor().setDependencyDirectivesFn(
+            std::move(GetDependencyDirectives));
+      }
+    };
 
     // Create the dependency collector that will collect the produced
     // dependencies.
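
(Aside: the refactor above moves the directive-scanning hook from PreprocessorOptions onto the Preprocessor itself, so it can only be installed once a CompilerInstance actually exists; the AdjustCI lambda is the conduit. A self-contained sketch of the general pattern, with all names invented and no claim about the real class layout:

    // Illustration only: passing a post-construction hook into an action.
    #include <functional>
    #include <utility>

    struct Instance {}; // stand-in for the real CompilerInstance

    class ActionSketch {
      std::function<void(Instance &)> Adjust; // stored at construction
    public:
      explicit ActionSketch(std::function<void(Instance &)> A)
          : Adjust(std::move(A)) {}
      void Execute(Instance &I) {
        if (Adjust)
          Adjust(I); // runs once the instance exists, before the real work
        // ... action-specific logic would follow ...
      }
    };

The hunk below threads std::move(AdjustCI) into the two frontend actions in this style.)
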
@@ -428,9 +430,11 @@ class DependencyScanningAction : public tooling::ToolAction {
 
     std::unique_ptr<FrontendAction> Action;
 
     if (ModuleName)
-      Action = std::make_unique<GetDependenciesByModuleNameAction>(*ModuleName);
+      Action = std::make_unique<GetDependenciesByModuleNameAction>(
+          *ModuleName, std::move(AdjustCI));
     else
-      Action = std::make_unique<ReadPCHAndPreprocessAction>();
+      Action =
+          std::make_unique<ReadPCHAndPreprocessAction>(std::move(AdjustCI));
 
     if (ScanInstance.getDiagnostics().hasErrorOccurred())
       return false;
diff --git a/clang/test/APINotes/Inputs/Headers/Templates.apinotes b/clang/test/APINotes/Inputs/Headers/Templates.apinotes
new file mode 100644
index 0000000000000..b7336484da0c7
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/Templates.apinotes
@@ -0,0 +1,5 @@
+---
+Name: Templates
+Tags:
+- Name: Box
+  SwiftImportAs: owned
diff --git a/clang/test/APINotes/Inputs/Headers/Templates.h b/clang/test/APINotes/Inputs/Headers/Templates.h
new file mode 100644
index 0000000000000..862035fee363f
--- /dev/null
+++ b/clang/test/APINotes/Inputs/Headers/Templates.h
@@ -0,0 +1,9 @@
+template <typename T>
+struct Box {
+  T value;
+
+  const T& get_value() const { return value; }
+  const T* get_ptr() const { return &value; }
+};
+
+using IntBox = Box<int>;
diff --git a/clang/test/APINotes/Inputs/Headers/module.modulemap b/clang/test/APINotes/Inputs/Headers/module.modulemap
index 99fb1aec86481..d515169184f4f 100644
--- a/clang/test/APINotes/Inputs/Headers/module.modulemap
+++ b/clang/test/APINotes/Inputs/Headers/module.modulemap
@@ -36,6 +36,10 @@ module Namespaces {
   header "Namespaces.h"
 }
 
+module Templates {
+  header "Templates.h"
+}
+
 module SwiftImportAs {
   header "SwiftImportAs.h"
 }
diff --git a/clang/test/APINotes/templates.cpp b/clang/test/APINotes/templates.cpp
new file mode 100644
index 0000000000000..d4dce291615e1
--- /dev/null
+++ b/clang/test/APINotes/templates.cpp
@@ -0,0 +1,9 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Tmpl -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -x c++
+// RUN: %clang_cc1 -fmodules -fblocks -fimplicit-module-maps -fmodules-cache-path=%t/ModulesCache/Tmpl -fdisable-module-hash -fapinotes-modules -fsyntax-only -I %S/Inputs/Headers -F %S/Inputs/Frameworks %s -ast-dump -ast-dump-filter Box -x c++ | FileCheck -check-prefix=CHECK-BOX %s
+
+#include "Templates.h"
+
+// CHECK-BOX: Dumping Box:
+// CHECK-BOX-NEXT: ClassTemplateDecl {{.+}} imported in Templates Box
+// CHECK-BOX: SwiftAttrAttr {{.+}} <<invalid sloc>> "import_owned"
diff --git a/clang/test/C/C2x/n2350.c b/clang/test/C/C2x/n2350.c
index 2f738488a3742..af0ca6d79be5e 100644
--- a/clang/test/C/C2x/n2350.c
+++ b/clang/test/C/C2x/n2350.c
@@ -5,7 +5,7 @@
 // RUN: %clang_cc1 -fsyntax-only -pedantic -Wno-comment -std=c99 -verify %s
 // RUN: %clang_cc1 -fsyntax-only -pedantic -Wno-comment -std=c11 -verify %s
 // RUN: %clang_cc1 -fsyntax-only -pedantic -Wno-comment -std=c17 -verify %s
-// RUN: %clang_cc1 -fsyntax-only -pedantic -Wno-comment -std=c2x -verify %s
+// RUN: %clang_cc1 -fsyntax-only -pedantic -Wno-comment -std=c2x -verify=silent %s
 
 // silent-no-diagnostics
 
@@ -13,10 +13,10 @@
 // https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2350.htm
 int simple(void) {
   return __builtin_offsetof(struct A // cpp-error {{'A' cannot be defined in a type specifier}} \
-                                        expected-warning {{defining a type within '__builtin_offsetof' is a Clang extension}}
+                                        expected-warning {{defining a type within '__builtin_offsetof' is a C23 extension}}
   {
     int a;
-    struct B // expected-warning {{defining a type within '__builtin_offsetof' is a Clang
extension}} + struct B // expected-warning {{defining a type within '__builtin_offsetof' is a C23 extension}} { int c; int d; @@ -26,7 +26,7 @@ int simple(void) { int anonymous_struct(void) { return __builtin_offsetof(struct // cpp-error-re {{'(unnamed struct at {{.*}})' cannot be defined in a type specifier}} \ - expected-warning {{defining a type within '__builtin_offsetof' is a Clang extension}} + expected-warning {{defining a type within '__builtin_offsetof' is a C23 extension}} { int a; int b; @@ -47,7 +47,7 @@ int struct_in_second_param(void) { int macro(void) { return offsetof(struct A // cpp-error {{'A' cannot be defined in a type specifier}} \ - expected-warning 2 {{defining a type within 'offsetof' is a Clang extension}} + expected-warning 2 {{defining a type within 'offsetof' is a C23 extension}} { int a; struct B // verifier seems to think the error is emitted by the macro diff --git a/clang/test/C/drs/dr4xx.c b/clang/test/C/drs/dr4xx.c index 30145dcfeef16..83d7b94cd6795 100644 --- a/clang/test/C/drs/dr4xx.c +++ b/clang/test/C/drs/dr4xx.c @@ -1,7 +1,7 @@ -/* RUN: %clang_cc1 -std=c89 -verify=expected,c89only -pedantic -Wno-c11-extensions %s - RUN: %clang_cc1 -std=c99 -verify=expected -pedantic -Wno-c11-extensions %s - RUN: %clang_cc1 -std=c11 -verify=expected -pedantic %s - RUN: %clang_cc1 -std=c17 -verify=expected -pedantic %s +/* RUN: %clang_cc1 -std=c89 -verify=expected,c89only,pre-c23 -pedantic -Wno-c11-extensions %s + RUN: %clang_cc1 -std=c99 -verify=expected,pre-c23 -pedantic -Wno-c11-extensions %s + RUN: %clang_cc1 -std=c11 -verify=expected,pre-c23 -pedantic %s + RUN: %clang_cc1 -std=c17 -verify=expected,pre-c23 -pedantic %s RUN: %clang_cc1 -std=c2x -verify=expected -pedantic %s */ @@ -343,10 +343,13 @@ void dr496(void) { */ /* The DR asked a question about whether defining a new type within offsetof - * is allowed. C2x N2350 made this explicitly undefined behavior, but GCC and - * Clang both support it as an extension. + * is allowed. C23 N2350 had made this explicitly undefined behavior, but this + * was later overturned when C23 DE-137 was accepted, making it well-formed. + * + * Additionally, GCC and Clang both support it as an extension in pre-C23 + * mode. 
*/ - (void)__builtin_offsetof(struct S { int a; }, a); /* expected-warning{{defining a type within '__builtin_offsetof' is a Clang extension}} */ + (void)__builtin_offsetof(struct S { int a; }, a); /* pre-c23-warning{{defining a type within '__builtin_offsetof' is a C23 extension}} */ } /* WG14 DR499: yes diff --git a/clang/test/CodeGen/aapcs-bitfield-access-unit.c b/clang/test/CodeGen/aapcs-bitfield-access-unit.c new file mode 100644 index 0000000000000..e95dba1c5f50c --- /dev/null +++ b/clang/test/CodeGen/aapcs-bitfield-access-unit.c @@ -0,0 +1,231 @@ +// RUN: %clang_cc1 -triple armv8-none-linux-eabi -fno-aapcs-bitfield-width -fdump-record-layouts-simple -emit-llvm -o /dev/null %s | FileCheck %s -check-prefixes=LAYOUT +// RUN: %clang_cc1 -triple armebv8-none-linux-eabi -fno-aapcs-bitfield-width -fdump-record-layouts-simple -emit-llvm -o /dev/null %s | FileCheck %s -check-prefixes=LAYOUT + +// RUN: %clang_cc1 -triple armv8-none-linux-eabi -faapcs-bitfield-width -fdump-record-layouts-simple -emit-llvm -o /dev/null %s | FileCheck %s -check-prefixes=LAYOUT +// RUN: %clang_cc1 -triple armebv8-none-linux-eabi -faapcs-bitfield-width -fdump-record-layouts-simple -emit-llvm -o /dev/null %s | FileCheck %s -check-prefixes=LAYOUT + +struct st0 { + short c : 7; +} st0; +// LAYOUT-LABEL: LLVMType:%struct.st0 = +// LAYOUT-SAME: type { i8, i8 } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + +struct st1 { + int a : 10; + short c : 6; +} st1; +// LAYOUT-LABEL: LLVMType:%struct.st1 = +// LAYOUT-SAME: type { i16, [2 x i8] } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + +struct st2 { + int a : 10; + short c : 7; +} st2; +// LAYOUT-LABEL: LLVMType:%struct.st2 = +// LAYOUT-SAME: type { i32 } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + +struct st3 { + volatile short c : 7; +} st3; +// LAYOUT-LABEL: LLVMType:%struct.st3 = +// LAYOUT-SAME: type { i8, i8 } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + +struct st4 { + int b : 9; + volatile char c : 5; +} st4; +// LAYOUT-LABEL: LLVMType:%struct.st4 = +// LAYOUT-SAME: type { i16, [2 x i8] } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + +struct st5 { + int a : 12; + volatile char c : 5; +} st5; +// LAYOUT-LABEL: LLVMType:%struct.st5 = +// LAYOUT-SAME: type { i32 } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + +struct st6 { + int a : 12; + char b; + int c : 5; +} st6; +// LAYOUT-LABEL: LLVMType:%struct.st6 = +// LAYOUT-SAME: type { i16, i8, i8 } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + +struct st7a { + char a; + int b : 5; +} st7a; +// LAYOUT-LABEL: LLVMType:%struct.st7a = +// LAYOUT-SAME: type { i8, i8, [2 x i8] } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + +struct st7b { + char x; + volatile struct st7a y; +} st7b; +// LAYOUT-LABEL: LLVMType:%struct.st7b = +// LAYOUT-SAME: type { i8, [3 x i8], %struct.st7a } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: ]> + +struct st8 { + unsigned f : 16; +} st8; +// LAYOUT-LABEL: LLVMType:%struct.st8 = +// LAYOUT-SAME: type { i16, [2 x i8] } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + +struct st9{ + int f : 8; +} st9; +// LAYOUT-LABEL: LLVMType:%struct.st9 = +// LAYOUT-SAME: type { i8, [3 x i8] } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + +struct st10{ + int e : 1; + int f : 8; +} st10; +// LAYOUT-LABEL: LLVMType:%struct.st10 = +// LAYOUT-SAME: type { i16, [2 x i8] } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + +struct st11{ + char e; + int f : 16; +} st11; +// LAYOUT-LABEL: LLVMType:%struct.st11 = +// LAYOUT-SAME: type <{ i8, i16, i8 }> +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + +struct st12{ + int e : 8; + int f : 16; +} st12; +// LAYOUT-LABEL: 
LLVMType:%struct.st12 = +// LAYOUT-SAME: type { i32 } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + +struct st13 { + char a : 8; + int b : 32; +} __attribute__((packed)) st13; +// LAYOUT-LABEL: LLVMType:%struct.st13 = +// LAYOUT-SAME: type <{ i8, i32 }> +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + +struct st14 { + char a : 8; +} __attribute__((packed)) st14; +// LAYOUT-LABEL: LLVMType:%struct.st14 = +// LAYOUT-SAME: type { i8 } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + +struct st15 { + short a : 8; +} __attribute__((packed)) st15; +// LAYOUT-LABEL: LLVMType:%struct.st15 = +// LAYOUT-SAME: type { i8 } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + +struct st16 { + int a : 32; + int b : 16; + int c : 32; + int d : 16; +} st16; +// LAYOUT-LABEL: LLVMType:%struct.st16 = +// LAYOUT-SAME: type { i32, i16, i32, i16 } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + +struct st17 { +int b : 32; +char c : 8; +} __attribute__((packed)) st17; +// LAYOUT-LABEL: LLVMType:%struct.st17 = +// LAYOUT-SAME: type <{ i32, i8 }> +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + +struct zero_bitfield { + int a : 8; + char : 0; + int b : 8; +} st18; +// LAYOUT-LABEL: LLVMType:%struct.zero_bitfield = +// LAYOUT-SAME: type { i8, i8, [2 x i8] } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + +struct zero_bitfield_ok { + short a : 8; + char a1 : 8; + long : 0; + int b : 24; +} st19; +// LAYOUT-LABEL: LLVMType:%struct.zero_bitfield_ok = +// LAYOUT-SAME: type { i16, i32 } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + + diff --git a/clang/test/CodeGen/aapcs-bitfield.c b/clang/test/CodeGen/aapcs-bitfield.c index 152ee26e7a3ea..0df250d4ebc53 100644 --- a/clang/test/CodeGen/aapcs-bitfield.c +++ b/clang/test/CodeGen/aapcs-bitfield.c @@ -299,77 +299,73 @@ struct st2 { // LE-LABEL: @st2_check_load( // LE-NEXT: entry: -// LE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// LE-NEXT: [[BF_LOAD:%.*]] = load i8, ptr [[C]], align 2 -// LE-NEXT: [[BF_SHL:%.*]] = shl i8 [[BF_LOAD]], 1 -// LE-NEXT: [[BF_ASHR:%.*]] = ashr i8 [[BF_SHL]], 1 -// LE-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i16 +// LE-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[M:%.*]], align 4 +// LE-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_LOAD]], 9 +// LE-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_SHL]], 25 +// LE-NEXT: [[BF_CAST:%.*]] = trunc i32 [[BF_ASHR]] to i16 // LE-NEXT: [[CONV:%.*]] = sext i16 [[BF_CAST]] to i32 // LE-NEXT: ret i32 [[CONV]] // // BE-LABEL: @st2_check_load( // BE-NEXT: entry: -// BE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// BE-NEXT: [[BF_LOAD:%.*]] = load i8, ptr [[C]], align 2 -// BE-NEXT: [[BF_ASHR:%.*]] = ashr i8 [[BF_LOAD]], 1 -// BE-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i16 +// BE-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[M:%.*]], align 4 +// BE-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_LOAD]], 16 +// BE-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_SHL]], 25 +// BE-NEXT: [[BF_CAST:%.*]] = trunc i32 [[BF_ASHR]] to i16 // BE-NEXT: [[CONV:%.*]] = sext i16 [[BF_CAST]] to i32 // BE-NEXT: ret i32 [[CONV]] // // LENUMLOADS-LABEL: @st2_check_load( // LENUMLOADS-NEXT: entry: -// LENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i8, ptr [[C]], align 2 -// LENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i8 [[BF_LOAD]], 1 -// LENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i8 [[BF_SHL]], 1 -// LENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i16 +// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[M:%.*]], align 4 +// 
LENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_LOAD]], 9 +// LENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_SHL]], 25 +// LENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i32 [[BF_ASHR]] to i16 // LENUMLOADS-NEXT: [[CONV:%.*]] = sext i16 [[BF_CAST]] to i32 // LENUMLOADS-NEXT: ret i32 [[CONV]] // // BENUMLOADS-LABEL: @st2_check_load( // BENUMLOADS-NEXT: entry: -// BENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i8, ptr [[C]], align 2 -// BENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i8 [[BF_LOAD]], 1 -// BENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i16 +// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[M:%.*]], align 4 +// BENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_LOAD]], 16 +// BENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_SHL]], 25 +// BENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i32 [[BF_ASHR]] to i16 // BENUMLOADS-NEXT: [[CONV:%.*]] = sext i16 [[BF_CAST]] to i32 // BENUMLOADS-NEXT: ret i32 [[CONV]] // // LEWIDTH-LABEL: @st2_check_load( // LEWIDTH-NEXT: entry: -// LEWIDTH-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i8, ptr [[C]], align 2 -// LEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i8 [[BF_LOAD]], 1 -// LEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i8 [[BF_SHL]], 1 -// LEWIDTH-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i16 +// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[M:%.*]], align 4 +// LEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_LOAD]], 9 +// LEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_SHL]], 25 +// LEWIDTH-NEXT: [[BF_CAST:%.*]] = trunc i32 [[BF_ASHR]] to i16 // LEWIDTH-NEXT: [[CONV:%.*]] = sext i16 [[BF_CAST]] to i32 // LEWIDTH-NEXT: ret i32 [[CONV]] // // BEWIDTH-LABEL: @st2_check_load( // BEWIDTH-NEXT: entry: -// BEWIDTH-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i8, ptr [[C]], align 2 -// BEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i8 [[BF_LOAD]], 1 -// BEWIDTH-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i16 +// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[M:%.*]], align 4 +// BEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_LOAD]], 16 +// BEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_SHL]], 25 +// BEWIDTH-NEXT: [[BF_CAST:%.*]] = trunc i32 [[BF_ASHR]] to i16 // BEWIDTH-NEXT: [[CONV:%.*]] = sext i16 [[BF_CAST]] to i32 // BEWIDTH-NEXT: ret i32 [[CONV]] // // LEWIDTHNUM-LABEL: @st2_check_load( // LEWIDTHNUM-NEXT: entry: -// LEWIDTHNUM-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i8, ptr [[C]], align 2 -// LEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i8 [[BF_LOAD]], 1 -// LEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i8 [[BF_SHL]], 1 -// LEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i16 +// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[M:%.*]], align 4 +// LEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_LOAD]], 9 +// LEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_SHL]], 25 +// LEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = trunc i32 [[BF_ASHR]] to i16 // LEWIDTHNUM-NEXT: [[CONV:%.*]] = sext i16 [[BF_CAST]] to i32 // LEWIDTHNUM-NEXT: ret i32 [[CONV]] // // BEWIDTHNUM-LABEL: @st2_check_load( // BEWIDTHNUM-NEXT: entry: -// BEWIDTHNUM-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i8, ptr [[C]], align 2 -// BEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i8 
[[BF_LOAD]], 1 -// BEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i16 +// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[M:%.*]], align 4 +// BEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_LOAD]], 16 +// BEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_SHL]], 25 +// BEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = trunc i32 [[BF_ASHR]] to i16 // BEWIDTHNUM-NEXT: [[CONV:%.*]] = sext i16 [[BF_CAST]] to i32 // BEWIDTHNUM-NEXT: ret i32 [[CONV]] // @@ -379,74 +375,66 @@ int st2_check_load(struct st2 *m) { // LE-LABEL: @st2_check_store( // LE-NEXT: entry: -// LE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// LE-NEXT: [[BF_LOAD:%.*]] = load i8, ptr [[C]], align 2 -// LE-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], -128 -// LE-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 1 -// LE-NEXT: store i8 [[BF_SET]], ptr [[C]], align 2 +// LE-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[M:%.*]], align 4 +// LE-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD]], -8323073 +// LE-NEXT: [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], 65536 +// LE-NEXT: store i32 [[BF_SET]], ptr [[M]], align 4 // LE-NEXT: ret void // // BE-LABEL: @st2_check_store( // BE-NEXT: entry: -// BE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// BE-NEXT: [[BF_LOAD:%.*]] = load i8, ptr [[C]], align 2 -// BE-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], 1 -// BE-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 2 -// BE-NEXT: store i8 [[BF_SET]], ptr [[C]], align 2 +// BE-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[M:%.*]], align 4 +// BE-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD]], -65025 +// BE-NEXT: [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], 512 +// BE-NEXT: store i32 [[BF_SET]], ptr [[M]], align 4 // BE-NEXT: ret void // // LENUMLOADS-LABEL: @st2_check_store( // LENUMLOADS-NEXT: entry: -// LENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i8, ptr [[C]], align 2 -// LENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], -128 -// LENUMLOADS-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 1 -// LENUMLOADS-NEXT: store i8 [[BF_SET]], ptr [[C]], align 2 +// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[M:%.*]], align 4 +// LENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD]], -8323073 +// LENUMLOADS-NEXT: [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], 65536 +// LENUMLOADS-NEXT: store i32 [[BF_SET]], ptr [[M]], align 4 // LENUMLOADS-NEXT: ret void // // BENUMLOADS-LABEL: @st2_check_store( // BENUMLOADS-NEXT: entry: -// BENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i8, ptr [[C]], align 2 -// BENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], 1 -// BENUMLOADS-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 2 -// BENUMLOADS-NEXT: store i8 [[BF_SET]], ptr [[C]], align 2 +// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[M:%.*]], align 4 +// BENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD]], -65025 +// BENUMLOADS-NEXT: [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], 512 +// BENUMLOADS-NEXT: store i32 [[BF_SET]], ptr [[M]], align 4 // BENUMLOADS-NEXT: ret void // // LEWIDTH-LABEL: @st2_check_store( // LEWIDTH-NEXT: entry: -// LEWIDTH-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i8, ptr [[C]], align 2 -// LEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], -128 -// LEWIDTH-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 1 -// 
LEWIDTH-NEXT: store i8 [[BF_SET]], ptr [[C]], align 2 +// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[M:%.*]], align 4 +// LEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD]], -8323073 +// LEWIDTH-NEXT: [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], 65536 +// LEWIDTH-NEXT: store i32 [[BF_SET]], ptr [[M]], align 4 // LEWIDTH-NEXT: ret void // // BEWIDTH-LABEL: @st2_check_store( // BEWIDTH-NEXT: entry: -// BEWIDTH-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i8, ptr [[C]], align 2 -// BEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], 1 -// BEWIDTH-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 2 -// BEWIDTH-NEXT: store i8 [[BF_SET]], ptr [[C]], align 2 +// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[M:%.*]], align 4 +// BEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD]], -65025 +// BEWIDTH-NEXT: [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], 512 +// BEWIDTH-NEXT: store i32 [[BF_SET]], ptr [[M]], align 4 // BEWIDTH-NEXT: ret void // // LEWIDTHNUM-LABEL: @st2_check_store( // LEWIDTHNUM-NEXT: entry: -// LEWIDTHNUM-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i8, ptr [[C]], align 2 -// LEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], -128 -// LEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 1 -// LEWIDTHNUM-NEXT: store i8 [[BF_SET]], ptr [[C]], align 2 +// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[M:%.*]], align 4 +// LEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD]], -8323073 +// LEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], 65536 +// LEWIDTHNUM-NEXT: store i32 [[BF_SET]], ptr [[M]], align 4 // LEWIDTHNUM-NEXT: ret void // // BEWIDTHNUM-LABEL: @st2_check_store( // BEWIDTHNUM-NEXT: entry: -// BEWIDTHNUM-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i8, ptr [[C]], align 2 -// BEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], 1 -// BEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 2 -// BEWIDTHNUM-NEXT: store i8 [[BF_SET]], ptr [[C]], align 2 +// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[M:%.*]], align 4 +// BEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD]], -65025 +// BEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], 512 +// BEWIDTHNUM-NEXT: store i32 [[BF_SET]], ptr [[M]], align 4 // BEWIDTHNUM-NEXT: ret void // void st2_check_store(struct st2 *m) { @@ -636,8 +624,8 @@ struct st4 { // // LEWIDTH-LABEL: @st4_check_load( // LEWIDTH-NEXT: entry: -// LEWIDTH-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 1 -// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP1]], align 1 +// LEWIDTH-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 1 +// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP0]], align 1 // LEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i8 [[BF_LOAD]], 2 // LEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i8 [[BF_SHL]], 3 // LEWIDTH-NEXT: [[CONV:%.*]] = sext i8 [[BF_ASHR]] to i32 @@ -645,8 +633,8 @@ struct st4 { // // BEWIDTH-LABEL: @st4_check_load( // BEWIDTH-NEXT: entry: -// BEWIDTH-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 1 -// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP1]], align 1 +// BEWIDTH-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 1 +// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP0]], align 1 // BEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i8 [[BF_LOAD]], 1 // 
BEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i8 [[BF_SHL]], 3 // BEWIDTH-NEXT: [[CONV:%.*]] = sext i8 [[BF_ASHR]] to i32 @@ -654,8 +642,8 @@ struct st4 { // // LEWIDTHNUM-LABEL: @st4_check_load( // LEWIDTHNUM-NEXT: entry: -// LEWIDTHNUM-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 1 -// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP1]], align 1 +// LEWIDTHNUM-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 1 +// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP0]], align 1 // LEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i8 [[BF_LOAD]], 2 // LEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i8 [[BF_SHL]], 3 // LEWIDTHNUM-NEXT: [[CONV:%.*]] = sext i8 [[BF_ASHR]] to i32 @@ -663,8 +651,8 @@ struct st4 { // // BEWIDTHNUM-LABEL: @st4_check_load( // BEWIDTHNUM-NEXT: entry: -// BEWIDTHNUM-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 1 -// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP1]], align 1 +// BEWIDTHNUM-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 1 +// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP0]], align 1 // BEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i8 [[BF_LOAD]], 1 // BEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i8 [[BF_SHL]], 3 // BEWIDTHNUM-NEXT: [[CONV:%.*]] = sext i8 [[BF_ASHR]] to i32 @@ -708,38 +696,38 @@ int st4_check_load(struct st4 *m) { // // LEWIDTH-LABEL: @st4_check_store( // LEWIDTH-NEXT: entry: -// LEWIDTH-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 1 -// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP1]], align 1 +// LEWIDTH-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 1 +// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP0]], align 1 // LEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], -63 // LEWIDTH-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 2 -// LEWIDTH-NEXT: store volatile i8 [[BF_SET]], ptr [[TMP1]], align 1 +// LEWIDTH-NEXT: store volatile i8 [[BF_SET]], ptr [[TMP0]], align 1 // LEWIDTH-NEXT: ret void // // BEWIDTH-LABEL: @st4_check_store( // BEWIDTH-NEXT: entry: -// BEWIDTH-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 1 -// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP1]], align 1 +// BEWIDTH-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 1 +// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP0]], align 1 // BEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], -125 // BEWIDTH-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 4 -// BEWIDTH-NEXT: store volatile i8 [[BF_SET]], ptr [[TMP1]], align 1 +// BEWIDTH-NEXT: store volatile i8 [[BF_SET]], ptr [[TMP0]], align 1 // BEWIDTH-NEXT: ret void // // LEWIDTHNUM-LABEL: @st4_check_store( // LEWIDTHNUM-NEXT: entry: -// LEWIDTHNUM-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 1 -// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP1]], align 1 +// LEWIDTHNUM-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 1 +// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP0]], align 1 // LEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], -63 // LEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 2 -// LEWIDTHNUM-NEXT: store volatile i8 [[BF_SET]], ptr [[TMP1]], align 1 +// LEWIDTHNUM-NEXT: store volatile i8 [[BF_SET]], ptr [[TMP0]], align 1 // LEWIDTHNUM-NEXT: ret void // // BEWIDTHNUM-LABEL: @st4_check_store( // BEWIDTHNUM-NEXT: entry: -// BEWIDTHNUM-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 1 -// 
BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP1]], align 1 +// BEWIDTHNUM-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 1 +// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP0]], align 1 // BEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], -125 // BEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 4 -// BEWIDTHNUM-NEXT: store volatile i8 [[BF_SET]], ptr [[TMP1]], align 1 +// BEWIDTHNUM-NEXT: store volatile i8 [[BF_SET]], ptr [[TMP0]], align 1 // BEWIDTHNUM-NEXT: ret void // void st4_check_store(struct st4 *m) { @@ -821,42 +809,44 @@ struct st5 { // LE-LABEL: @st5_check_load( // LE-NEXT: entry: -// LE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST5:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// LE-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 2 -// LE-NEXT: [[BF_SHL:%.*]] = shl i8 [[BF_LOAD]], 3 -// LE-NEXT: [[BF_ASHR:%.*]] = ashr i8 [[BF_SHL]], 3 -// LE-NEXT: [[CONV:%.*]] = sext i8 [[BF_ASHR]] to i32 +// LE-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[M:%.*]], align 4 +// LE-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_LOAD]], 11 +// LE-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_SHL]], 27 +// LE-NEXT: [[BF_CAST:%.*]] = trunc i32 [[BF_ASHR]] to i8 +// LE-NEXT: [[CONV:%.*]] = sext i8 [[BF_CAST]] to i32 // LE-NEXT: ret i32 [[CONV]] // // BE-LABEL: @st5_check_load( // BE-NEXT: entry: -// BE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST5:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// BE-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 2 -// BE-NEXT: [[BF_ASHR:%.*]] = ashr i8 [[BF_LOAD]], 3 -// BE-NEXT: [[CONV:%.*]] = sext i8 [[BF_ASHR]] to i32 +// BE-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[M:%.*]], align 4 +// BE-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_LOAD]], 16 +// BE-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_SHL]], 27 +// BE-NEXT: [[BF_CAST:%.*]] = trunc i32 [[BF_ASHR]] to i8 +// BE-NEXT: [[CONV:%.*]] = sext i8 [[BF_CAST]] to i32 // BE-NEXT: ret i32 [[CONV]] // // LENUMLOADS-LABEL: @st5_check_load( // LENUMLOADS-NEXT: entry: -// LENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST5:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 2 -// LENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i8 [[BF_LOAD]], 3 -// LENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i8 [[BF_SHL]], 3 -// LENUMLOADS-NEXT: [[CONV:%.*]] = sext i8 [[BF_ASHR]] to i32 +// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[M:%.*]], align 4 +// LENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_LOAD]], 11 +// LENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_SHL]], 27 +// LENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i32 [[BF_ASHR]] to i8 +// LENUMLOADS-NEXT: [[CONV:%.*]] = sext i8 [[BF_CAST]] to i32 // LENUMLOADS-NEXT: ret i32 [[CONV]] // // BENUMLOADS-LABEL: @st5_check_load( // BENUMLOADS-NEXT: entry: -// BENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST5:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 2 -// BENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i8 [[BF_LOAD]], 3 -// BENUMLOADS-NEXT: [[CONV:%.*]] = sext i8 [[BF_ASHR]] to i32 +// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[M:%.*]], align 4 +// BENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_LOAD]], 16 +// BENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_SHL]], 27 +// BENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i32 [[BF_ASHR]] to i8 +// BENUMLOADS-NEXT: [[CONV:%.*]] = sext i8 [[BF_CAST]] to i32 // BENUMLOADS-NEXT: ret i32 [[CONV]] // // LEWIDTH-LABEL: @st5_check_load( // 
LEWIDTH-NEXT: entry: -// LEWIDTH-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST5:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 2 +// LEWIDTH-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 2 +// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP0]], align 2 // LEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i8 [[BF_LOAD]], 3 // LEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i8 [[BF_SHL]], 3 // LEWIDTH-NEXT: [[CONV:%.*]] = sext i8 [[BF_ASHR]] to i32 @@ -864,16 +854,16 @@ struct st5 { // // BEWIDTH-LABEL: @st5_check_load( // BEWIDTH-NEXT: entry: -// BEWIDTH-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST5:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 2 +// BEWIDTH-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 2 +// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP0]], align 2 // BEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i8 [[BF_LOAD]], 3 // BEWIDTH-NEXT: [[CONV:%.*]] = sext i8 [[BF_ASHR]] to i32 // BEWIDTH-NEXT: ret i32 [[CONV]] // // LEWIDTHNUM-LABEL: @st5_check_load( // LEWIDTHNUM-NEXT: entry: -// LEWIDTHNUM-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST5:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 2 +// LEWIDTHNUM-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 2 +// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP0]], align 2 // LEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i8 [[BF_LOAD]], 3 // LEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i8 [[BF_SHL]], 3 // LEWIDTHNUM-NEXT: [[CONV:%.*]] = sext i8 [[BF_ASHR]] to i32 @@ -881,8 +871,8 @@ struct st5 { // // BEWIDTHNUM-LABEL: @st5_check_load( // BEWIDTHNUM-NEXT: entry: -// BEWIDTHNUM-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST5:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 2 +// BEWIDTHNUM-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 2 +// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP0]], align 2 // BEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i8 [[BF_LOAD]], 3 // BEWIDTHNUM-NEXT: [[CONV:%.*]] = sext i8 [[BF_ASHR]] to i32 // BEWIDTHNUM-NEXT: ret i32 [[CONV]] @@ -893,74 +883,70 @@ int st5_check_load(struct st5 *m) { // LE-LABEL: @st5_check_store( // LE-NEXT: entry: -// LE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST5:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// LE-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 2 -// LE-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], -32 -// LE-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 1 -// LE-NEXT: store volatile i8 [[BF_SET]], ptr [[C]], align 2 +// LE-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[M:%.*]], align 4 +// LE-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD]], -2031617 +// LE-NEXT: [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], 65536 +// LE-NEXT: store volatile i32 [[BF_SET]], ptr [[M]], align 4 // LE-NEXT: ret void // // BE-LABEL: @st5_check_store( // BE-NEXT: entry: -// BE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST5:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// BE-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 2 -// BE-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], 7 -// BE-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 8 -// BE-NEXT: store volatile i8 [[BF_SET]], ptr [[C]], align 2 +// BE-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[M:%.*]], align 4 +// BE-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD]], -63489 +// BE-NEXT: 
[[BF_SET:%.*]] = or i32 [[BF_CLEAR]], 2048 +// BE-NEXT: store volatile i32 [[BF_SET]], ptr [[M]], align 4 // BE-NEXT: ret void // // LENUMLOADS-LABEL: @st5_check_store( // LENUMLOADS-NEXT: entry: -// LENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST5:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 2 -// LENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], -32 -// LENUMLOADS-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 1 -// LENUMLOADS-NEXT: store volatile i8 [[BF_SET]], ptr [[C]], align 2 +// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[M:%.*]], align 4 +// LENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD]], -2031617 +// LENUMLOADS-NEXT: [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], 65536 +// LENUMLOADS-NEXT: store volatile i32 [[BF_SET]], ptr [[M]], align 4 // LENUMLOADS-NEXT: ret void // // BENUMLOADS-LABEL: @st5_check_store( // BENUMLOADS-NEXT: entry: -// BENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST5:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 2 -// BENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], 7 -// BENUMLOADS-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 8 -// BENUMLOADS-NEXT: store volatile i8 [[BF_SET]], ptr [[C]], align 2 +// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[M:%.*]], align 4 +// BENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD]], -63489 +// BENUMLOADS-NEXT: [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], 2048 +// BENUMLOADS-NEXT: store volatile i32 [[BF_SET]], ptr [[M]], align 4 // BENUMLOADS-NEXT: ret void // // LEWIDTH-LABEL: @st5_check_store( // LEWIDTH-NEXT: entry: -// LEWIDTH-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST5:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 2 +// LEWIDTH-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 2 +// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP0]], align 2 // LEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], -32 // LEWIDTH-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 1 -// LEWIDTH-NEXT: store volatile i8 [[BF_SET]], ptr [[C]], align 2 +// LEWIDTH-NEXT: store volatile i8 [[BF_SET]], ptr [[TMP0]], align 2 // LEWIDTH-NEXT: ret void // // BEWIDTH-LABEL: @st5_check_store( // BEWIDTH-NEXT: entry: -// BEWIDTH-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST5:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 2 +// BEWIDTH-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 2 +// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP0]], align 2 // BEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], 7 // BEWIDTH-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 8 -// BEWIDTH-NEXT: store volatile i8 [[BF_SET]], ptr [[C]], align 2 +// BEWIDTH-NEXT: store volatile i8 [[BF_SET]], ptr [[TMP0]], align 2 // BEWIDTH-NEXT: ret void // // LEWIDTHNUM-LABEL: @st5_check_store( // LEWIDTHNUM-NEXT: entry: -// LEWIDTHNUM-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST5:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 2 +// LEWIDTHNUM-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 2 +// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP0]], align 2 // LEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], -32 // LEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 1 -// LEWIDTHNUM-NEXT: store volatile 
i8 [[BF_SET]], ptr [[C]], align 2 +// LEWIDTHNUM-NEXT: store volatile i8 [[BF_SET]], ptr [[TMP0]], align 2 // LEWIDTHNUM-NEXT: ret void // // BEWIDTHNUM-LABEL: @st5_check_store( // BEWIDTHNUM-NEXT: entry: -// BEWIDTHNUM-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST5:%.*]], ptr [[M:%.*]], i32 0, i32 1 -// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 2 +// BEWIDTHNUM-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[M:%.*]], i32 2 +// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP0]], align 2 // BEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i8 [[BF_LOAD]], 7 // BEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 8 -// BEWIDTHNUM-NEXT: store volatile i8 [[BF_SET]], ptr [[C]], align 2 +// BEWIDTHNUM-NEXT: store volatile i8 [[BF_SET]], ptr [[TMP0]], align 2 // BEWIDTHNUM-NEXT: ret void // void st5_check_store(struct st5 *m) { @@ -980,8 +966,8 @@ struct st6 { // LE-NEXT: [[BF_ASHR:%.*]] = ashr i16 [[BF_SHL]], 4 // LE-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_ASHR]] to i32 // LE-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST6:%.*]], ptr [[M]], i32 0, i32 1 -// LE-NEXT: [[TMP1:%.*]] = load volatile i8, ptr [[B]], align 2 -// LE-NEXT: [[CONV:%.*]] = sext i8 [[TMP1]] to i32 +// LE-NEXT: [[TMP0:%.*]] = load volatile i8, ptr [[B]], align 2 +// LE-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32 // LE-NEXT: [[ADD:%.*]] = add nsw i32 [[BF_CAST]], [[CONV]] // LE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[M]], i32 0, i32 2 // LE-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[C]], align 1 @@ -997,8 +983,8 @@ struct st6 { // BE-NEXT: [[BF_ASHR:%.*]] = ashr i16 [[BF_LOAD]], 4 // BE-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_ASHR]] to i32 // BE-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST6:%.*]], ptr [[M]], i32 0, i32 1 -// BE-NEXT: [[TMP1:%.*]] = load volatile i8, ptr [[B]], align 2 -// BE-NEXT: [[CONV:%.*]] = sext i8 [[TMP1]] to i32 +// BE-NEXT: [[TMP0:%.*]] = load volatile i8, ptr [[B]], align 2 +// BE-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32 // BE-NEXT: [[ADD:%.*]] = add nsw i32 [[BF_CAST]], [[CONV]] // BE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[M]], i32 0, i32 2 // BE-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[C]], align 1 @@ -1014,8 +1000,8 @@ struct st6 { // LENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i16 [[BF_SHL]], 4 // LENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_ASHR]] to i32 // LENUMLOADS-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST6:%.*]], ptr [[M]], i32 0, i32 1 -// LENUMLOADS-NEXT: [[TMP1:%.*]] = load volatile i8, ptr [[B]], align 2 -// LENUMLOADS-NEXT: [[CONV:%.*]] = sext i8 [[TMP1]] to i32 +// LENUMLOADS-NEXT: [[TMP0:%.*]] = load volatile i8, ptr [[B]], align 2 +// LENUMLOADS-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32 // LENUMLOADS-NEXT: [[ADD:%.*]] = add nsw i32 [[BF_CAST]], [[CONV]] // LENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[M]], i32 0, i32 2 // LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[C]], align 1 @@ -1031,8 +1017,8 @@ struct st6 { // BENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i16 [[BF_LOAD]], 4 // BENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_ASHR]] to i32 // BENUMLOADS-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST6:%.*]], ptr [[M]], i32 0, i32 1 -// BENUMLOADS-NEXT: [[TMP1:%.*]] = load volatile i8, ptr [[B]], align 2 -// BENUMLOADS-NEXT: [[CONV:%.*]] = sext i8 [[TMP1]] to i32 +// BENUMLOADS-NEXT: [[TMP0:%.*]] = load volatile i8, ptr [[B]], align 2 +// BENUMLOADS-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32 // 
BENUMLOADS-NEXT: [[ADD:%.*]] = add nsw i32 [[BF_CAST]], [[CONV]] // BENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[M]], i32 0, i32 2 // BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[C]], align 1 @@ -1048,8 +1034,8 @@ struct st6 { // LEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i16 [[BF_SHL]], 4 // LEWIDTH-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_ASHR]] to i32 // LEWIDTH-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST6:%.*]], ptr [[M]], i32 0, i32 1 -// LEWIDTH-NEXT: [[TMP1:%.*]] = load volatile i8, ptr [[B]], align 2 -// LEWIDTH-NEXT: [[CONV:%.*]] = sext i8 [[TMP1]] to i32 +// LEWIDTH-NEXT: [[TMP0:%.*]] = load volatile i8, ptr [[B]], align 2 +// LEWIDTH-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32 // LEWIDTH-NEXT: [[ADD:%.*]] = add nsw i32 [[BF_CAST]], [[CONV]] // LEWIDTH-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[M]], i32 0, i32 2 // LEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[C]], align 1 @@ -1065,8 +1051,8 @@ struct st6 { // BEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i16 [[BF_LOAD]], 4 // BEWIDTH-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_ASHR]] to i32 // BEWIDTH-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST6:%.*]], ptr [[M]], i32 0, i32 1 -// BEWIDTH-NEXT: [[TMP1:%.*]] = load volatile i8, ptr [[B]], align 2 -// BEWIDTH-NEXT: [[CONV:%.*]] = sext i8 [[TMP1]] to i32 +// BEWIDTH-NEXT: [[TMP0:%.*]] = load volatile i8, ptr [[B]], align 2 +// BEWIDTH-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32 // BEWIDTH-NEXT: [[ADD:%.*]] = add nsw i32 [[BF_CAST]], [[CONV]] // BEWIDTH-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[M]], i32 0, i32 2 // BEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[C]], align 1 @@ -1082,8 +1068,8 @@ struct st6 { // LEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i16 [[BF_SHL]], 4 // LEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_ASHR]] to i32 // LEWIDTHNUM-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST6:%.*]], ptr [[M]], i32 0, i32 1 -// LEWIDTHNUM-NEXT: [[TMP1:%.*]] = load volatile i8, ptr [[B]], align 2 -// LEWIDTHNUM-NEXT: [[CONV:%.*]] = sext i8 [[TMP1]] to i32 +// LEWIDTHNUM-NEXT: [[TMP0:%.*]] = load volatile i8, ptr [[B]], align 2 +// LEWIDTHNUM-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32 // LEWIDTHNUM-NEXT: [[ADD:%.*]] = add nsw i32 [[BF_CAST]], [[CONV]] // LEWIDTHNUM-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[M]], i32 0, i32 2 // LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[C]], align 1 @@ -1099,8 +1085,8 @@ struct st6 { // BEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i16 [[BF_LOAD]], 4 // BEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_ASHR]] to i32 // BEWIDTHNUM-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST6:%.*]], ptr [[M]], i32 0, i32 1 -// BEWIDTHNUM-NEXT: [[TMP1:%.*]] = load volatile i8, ptr [[B]], align 2 -// BEWIDTHNUM-NEXT: [[CONV:%.*]] = sext i8 [[TMP1]] to i32 +// BEWIDTHNUM-NEXT: [[TMP0:%.*]] = load volatile i8, ptr [[B]], align 2 +// BEWIDTHNUM-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32 // BEWIDTHNUM-NEXT: [[ADD:%.*]] = add nsw i32 [[BF_CAST]], [[CONV]] // BEWIDTHNUM-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[M]], i32 0, i32 2 // BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[C]], align 1 @@ -1704,9 +1690,9 @@ void store_st9(volatile struct st9 *m) { // LE-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[M:%.*]], align 4 // LE-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i32 // LE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LE-NEXT: [[TMP1:%.*]] = trunc i32 [[INC]] to i8 -// LE-NEXT: 
store volatile i8 [[TMP1]], ptr [[M]], align 4 -// LE-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i32 +// LE-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i8 +// LE-NEXT: store volatile i8 [[TMP0]], ptr [[M]], align 4 +// LE-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i32 // LE-NEXT: ret void // // BE-LABEL: @increment_st9( @@ -1714,9 +1700,9 @@ void store_st9(volatile struct st9 *m) { // BE-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[M:%.*]], align 4 // BE-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i32 // BE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BE-NEXT: [[TMP1:%.*]] = trunc i32 [[INC]] to i8 -// BE-NEXT: store volatile i8 [[TMP1]], ptr [[M]], align 4 -// BE-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i32 +// BE-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i8 +// BE-NEXT: store volatile i8 [[TMP0]], ptr [[M]], align 4 +// BE-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i32 // BE-NEXT: ret void // // LENUMLOADS-LABEL: @increment_st9( @@ -1724,10 +1710,10 @@ void store_st9(volatile struct st9 *m) { // LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[M:%.*]], align 4 // LENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i32 // LENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LENUMLOADS-NEXT: [[TMP1:%.*]] = trunc i32 [[INC]] to i8 +// LENUMLOADS-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i8 // LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[M]], align 4 -// LENUMLOADS-NEXT: store volatile i8 [[TMP1]], ptr [[M]], align 4 -// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i32 +// LENUMLOADS-NEXT: store volatile i8 [[TMP0]], ptr [[M]], align 4 +// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i32 // LENUMLOADS-NEXT: ret void // // BENUMLOADS-LABEL: @increment_st9( @@ -1735,10 +1721,10 @@ void store_st9(volatile struct st9 *m) { // BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[M:%.*]], align 4 // BENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i32 // BENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BENUMLOADS-NEXT: [[TMP1:%.*]] = trunc i32 [[INC]] to i8 +// BENUMLOADS-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i8 // BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[M]], align 4 -// BENUMLOADS-NEXT: store volatile i8 [[TMP1]], ptr [[M]], align 4 -// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i32 +// BENUMLOADS-NEXT: store volatile i8 [[TMP0]], ptr [[M]], align 4 +// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i32 // BENUMLOADS-NEXT: ret void // // LEWIDTH-LABEL: @increment_st9( @@ -1949,9 +1935,9 @@ void store_st10(volatile struct st10 *m) { // LE-NEXT: [[BF_ASHR:%.*]] = ashr i16 [[BF_SHL]], 8 // LE-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_ASHR]] to i32 // LE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LE-NEXT: [[TMP1:%.*]] = trunc i32 [[INC]] to i16 +// LE-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 // LE-NEXT: [[BF_LOAD1:%.*]] = load volatile i16, ptr [[M]], align 4 -// LE-NEXT: [[BF_VALUE:%.*]] = and i16 [[TMP1]], 255 +// LE-NEXT: [[BF_VALUE:%.*]] = and i16 [[TMP0]], 255 // LE-NEXT: [[BF_SHL2:%.*]] = shl i16 [[BF_VALUE]], 1 // LE-NEXT: [[BF_CLEAR:%.*]] = and i16 [[BF_LOAD1]], -511 // LE-NEXT: [[BF_SET:%.*]] = or i16 [[BF_CLEAR]], [[BF_SHL2]] @@ -1968,9 +1954,9 @@ void store_st10(volatile struct st10 *m) { // BE-NEXT: [[BF_ASHR:%.*]] = ashr i16 [[BF_SHL]], 8 // BE-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_ASHR]] to i32 // BE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BE-NEXT: [[TMP1:%.*]] = 
trunc i32 [[INC]] to i16 +// BE-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 // BE-NEXT: [[BF_LOAD1:%.*]] = load volatile i16, ptr [[M]], align 4 -// BE-NEXT: [[BF_VALUE:%.*]] = and i16 [[TMP1]], 255 +// BE-NEXT: [[BF_VALUE:%.*]] = and i16 [[TMP0]], 255 // BE-NEXT: [[BF_SHL2:%.*]] = shl i16 [[BF_VALUE]], 7 // BE-NEXT: [[BF_CLEAR:%.*]] = and i16 [[BF_LOAD1]], -32641 // BE-NEXT: [[BF_SET:%.*]] = or i16 [[BF_CLEAR]], [[BF_SHL2]] @@ -1987,9 +1973,9 @@ void store_st10(volatile struct st10 *m) { // LENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i16 [[BF_SHL]], 8 // LENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_ASHR]] to i32 // LENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LENUMLOADS-NEXT: [[TMP1:%.*]] = trunc i32 [[INC]] to i16 +// LENUMLOADS-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 // LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i16, ptr [[M]], align 4 -// LENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i16 [[TMP1]], 255 +// LENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i16 [[TMP0]], 255 // LENUMLOADS-NEXT: [[BF_SHL2:%.*]] = shl i16 [[BF_VALUE]], 1 // LENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i16 [[BF_LOAD1]], -511 // LENUMLOADS-NEXT: [[BF_SET:%.*]] = or i16 [[BF_CLEAR]], [[BF_SHL2]] @@ -2006,9 +1992,9 @@ void store_st10(volatile struct st10 *m) { // BENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i16 [[BF_SHL]], 8 // BENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_ASHR]] to i32 // BENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BENUMLOADS-NEXT: [[TMP1:%.*]] = trunc i32 [[INC]] to i16 +// BENUMLOADS-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 // BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i16, ptr [[M]], align 4 -// BENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i16 [[TMP1]], 255 +// BENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i16 [[TMP0]], 255 // BENUMLOADS-NEXT: [[BF_SHL2:%.*]] = shl i16 [[BF_VALUE]], 7 // BENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i16 [[BF_LOAD1]], -32641 // BENUMLOADS-NEXT: [[BF_SET:%.*]] = or i16 [[BF_CLEAR]], [[BF_SHL2]] @@ -2767,146 +2753,70 @@ struct st13 { // LE-LABEL: @increment_b_st13( // LE-NEXT: entry: -// LE-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// LE-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_LOAD]], 8 -// LE-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i32 -// LE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i40 -// LE-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// LE-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 4294967295 -// LE-NEXT: [[BF_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// LE-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], 255 -// LE-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_SHL]] -// LE-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// LE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// LE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 8 -// LE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i32 +// LE-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST13:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// LE-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[B]], align 1 +// LE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LE-NEXT: store volatile i32 [[INC]], ptr [[B]], align 1 // LE-NEXT: ret void // // BE-LABEL: @increment_b_st13( // BE-NEXT: entry: -// BE-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// BE-NEXT: [[BF_SHL:%.*]] = shl i40 [[BF_LOAD]], 8 -// BE-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_SHL]], 8 -// BE-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i32 -// BE-NEXT: 
[[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i40 -// BE-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// BE-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 4294967295 -// BE-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], -4294967296 -// BE-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_VALUE]] -// BE-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// BE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// BE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 8 -// BE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i32 +// BE-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST13:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// BE-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[B]], align 1 +// BE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BE-NEXT: store volatile i32 [[INC]], ptr [[B]], align 1 // BE-NEXT: ret void // // LENUMLOADS-LABEL: @increment_b_st13( // LENUMLOADS-NEXT: entry: -// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// LENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_LOAD]], 8 -// LENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i32 -// LENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i40 -// LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// LENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 4294967295 -// LENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// LENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], 255 -// LENUMLOADS-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_SHL]] -// LENUMLOADS-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// LENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// LENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 8 -// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i32 +// LENUMLOADS-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST13:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[B]], align 1 +// LENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[B]], align 1 +// LENUMLOADS-NEXT: store volatile i32 [[INC]], ptr [[B]], align 1 // LENUMLOADS-NEXT: ret void // // BENUMLOADS-LABEL: @increment_b_st13( // BENUMLOADS-NEXT: entry: -// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// BENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i40 [[BF_LOAD]], 8 -// BENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_SHL]], 8 -// BENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i32 -// BENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i40 -// BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// BENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 4294967295 -// BENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], -4294967296 -// BENUMLOADS-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_VALUE]] -// BENUMLOADS-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// BENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// BENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 8 -// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i32 +// BENUMLOADS-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST13:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// 
BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[B]], align 1 +// BENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[B]], align 1 +// BENUMLOADS-NEXT: store volatile i32 [[INC]], ptr [[B]], align 1 // BENUMLOADS-NEXT: ret void // // LEWIDTH-LABEL: @increment_b_st13( // LEWIDTH-NEXT: entry: -// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// LEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_LOAD]], 8 -// LEWIDTH-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i32 -// LEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LEWIDTH-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i40 -// LEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// LEWIDTH-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 4294967295 -// LEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// LEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], 255 -// LEWIDTH-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_SHL]] -// LEWIDTH-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// LEWIDTH-NEXT: [[BF_RESULT_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// LEWIDTH-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 8 -// LEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i32 +// LEWIDTH-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST13:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[B]], align 1 +// LEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LEWIDTH-NEXT: store volatile i32 [[INC]], ptr [[B]], align 1 // LEWIDTH-NEXT: ret void // // BEWIDTH-LABEL: @increment_b_st13( // BEWIDTH-NEXT: entry: -// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// BEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i40 [[BF_LOAD]], 8 -// BEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_SHL]], 8 -// BEWIDTH-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i32 -// BEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BEWIDTH-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i40 -// BEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// BEWIDTH-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 4294967295 -// BEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], -4294967296 -// BEWIDTH-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_VALUE]] -// BEWIDTH-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// BEWIDTH-NEXT: [[BF_RESULT_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// BEWIDTH-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 8 -// BEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i32 +// BEWIDTH-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST13:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[B]], align 1 +// BEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BEWIDTH-NEXT: store volatile i32 [[INC]], ptr [[B]], align 1 // BEWIDTH-NEXT: ret void // // LEWIDTHNUM-LABEL: @increment_b_st13( // LEWIDTHNUM-NEXT: entry: -// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// LEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_LOAD]], 8 -// LEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i32 -// LEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LEWIDTHNUM-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i40 -// LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// LEWIDTHNUM-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 4294967295 -// LEWIDTHNUM-NEXT: [[BF_SHL:%.*]] 
= shl i40 [[BF_VALUE]], 8 -// LEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], 255 -// LEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_SHL]] -// LEWIDTHNUM-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// LEWIDTHNUM-NEXT: [[BF_RESULT_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// LEWIDTHNUM-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 8 -// LEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i32 +// LEWIDTHNUM-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST13:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[B]], align 1 +// LEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[B]], align 1 +// LEWIDTHNUM-NEXT: store volatile i32 [[INC]], ptr [[B]], align 1 // LEWIDTHNUM-NEXT: ret void // // BEWIDTHNUM-LABEL: @increment_b_st13( // BEWIDTHNUM-NEXT: entry: -// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// BEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i40 [[BF_LOAD]], 8 -// BEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_SHL]], 8 -// BEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i32 -// BEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BEWIDTHNUM-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i40 -// BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// BEWIDTHNUM-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 4294967295 -// BEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], -4294967296 -// BEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_VALUE]] -// BEWIDTHNUM-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// BEWIDTHNUM-NEXT: [[BF_RESULT_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// BEWIDTHNUM-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 8 -// BEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i32 +// BEWIDTHNUM-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST13:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[B]], align 1 +// BEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[B]], align 1 +// BEWIDTHNUM-NEXT: store volatile i32 [[INC]], ptr [[B]], align 1 // BEWIDTHNUM-NEXT: ret void // void increment_b_st13(volatile struct st13 *s) { @@ -2990,9 +2900,9 @@ struct st15 { // LE-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[S:%.*]], align 1 // LE-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i16 // LE-NEXT: [[INC:%.*]] = add i16 [[BF_CAST]], 1 -// LE-NEXT: [[TMP1:%.*]] = trunc i16 [[INC]] to i8 -// LE-NEXT: store volatile i8 [[TMP1]], ptr [[S]], align 1 -// LE-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i16 +// LE-NEXT: [[TMP0:%.*]] = trunc i16 [[INC]] to i8 +// LE-NEXT: store volatile i8 [[TMP0]], ptr [[S]], align 1 +// LE-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i16 // LE-NEXT: ret void // // BE-LABEL: @increment_a_st15( @@ -3000,9 +2910,9 @@ struct st15 { // BE-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[S:%.*]], align 1 // BE-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i16 // BE-NEXT: [[INC:%.*]] = add i16 [[BF_CAST]], 1 -// BE-NEXT: [[TMP1:%.*]] = trunc i16 [[INC]] to i8 -// BE-NEXT: store volatile i8 [[TMP1]], ptr [[S]], align 1 -// BE-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i16 +// BE-NEXT: [[TMP0:%.*]] = trunc i16 [[INC]] to i8 +// BE-NEXT: store volatile i8 [[TMP0]], ptr [[S]], align 1 +// BE-NEXT: 
[[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i16 // BE-NEXT: ret void // // LENUMLOADS-LABEL: @increment_a_st15( @@ -3010,10 +2920,10 @@ struct st15 { // LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[S:%.*]], align 1 // LENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i16 // LENUMLOADS-NEXT: [[INC:%.*]] = add i16 [[BF_CAST]], 1 -// LENUMLOADS-NEXT: [[TMP1:%.*]] = trunc i16 [[INC]] to i8 +// LENUMLOADS-NEXT: [[TMP0:%.*]] = trunc i16 [[INC]] to i8 // LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[S]], align 1 -// LENUMLOADS-NEXT: store volatile i8 [[TMP1]], ptr [[S]], align 1 -// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i16 +// LENUMLOADS-NEXT: store volatile i8 [[TMP0]], ptr [[S]], align 1 +// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i16 // LENUMLOADS-NEXT: ret void // // BENUMLOADS-LABEL: @increment_a_st15( @@ -3021,10 +2931,10 @@ struct st15 { // BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[S:%.*]], align 1 // BENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i16 // BENUMLOADS-NEXT: [[INC:%.*]] = add i16 [[BF_CAST]], 1 -// BENUMLOADS-NEXT: [[TMP1:%.*]] = trunc i16 [[INC]] to i8 +// BENUMLOADS-NEXT: [[TMP0:%.*]] = trunc i16 [[INC]] to i8 // BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[S]], align 1 -// BENUMLOADS-NEXT: store volatile i8 [[TMP1]], ptr [[S]], align 1 -// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i16 +// BENUMLOADS-NEXT: store volatile i8 [[TMP0]], ptr [[S]], align 1 +// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i16 // BENUMLOADS-NEXT: ret void // // LEWIDTH-LABEL: @increment_a_st15( @@ -3032,9 +2942,9 @@ struct st15 { // LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[S:%.*]], align 1 // LEWIDTH-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i16 // LEWIDTH-NEXT: [[INC:%.*]] = add i16 [[BF_CAST]], 1 -// LEWIDTH-NEXT: [[TMP1:%.*]] = trunc i16 [[INC]] to i8 -// LEWIDTH-NEXT: store volatile i8 [[TMP1]], ptr [[S]], align 1 -// LEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i16 +// LEWIDTH-NEXT: [[TMP0:%.*]] = trunc i16 [[INC]] to i8 +// LEWIDTH-NEXT: store volatile i8 [[TMP0]], ptr [[S]], align 1 +// LEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i16 // LEWIDTH-NEXT: ret void // // BEWIDTH-LABEL: @increment_a_st15( @@ -3042,9 +2952,9 @@ struct st15 { // BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[S:%.*]], align 1 // BEWIDTH-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i16 // BEWIDTH-NEXT: [[INC:%.*]] = add i16 [[BF_CAST]], 1 -// BEWIDTH-NEXT: [[TMP1:%.*]] = trunc i16 [[INC]] to i8 -// BEWIDTH-NEXT: store volatile i8 [[TMP1]], ptr [[S]], align 1 -// BEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i16 +// BEWIDTH-NEXT: [[TMP0:%.*]] = trunc i16 [[INC]] to i8 +// BEWIDTH-NEXT: store volatile i8 [[TMP0]], ptr [[S]], align 1 +// BEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i16 // BEWIDTH-NEXT: ret void // // LEWIDTHNUM-LABEL: @increment_a_st15( @@ -3052,10 +2962,10 @@ struct st15 { // LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[S:%.*]], align 1 // LEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i16 // LEWIDTHNUM-NEXT: [[INC:%.*]] = add i16 [[BF_CAST]], 1 -// LEWIDTHNUM-NEXT: [[TMP1:%.*]] = trunc i16 [[INC]] to i8 +// LEWIDTHNUM-NEXT: [[TMP0:%.*]] = trunc i16 [[INC]] to i8 // LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[S]], align 1 -// LEWIDTHNUM-NEXT: store volatile i8 [[TMP1]], ptr [[S]], align 1 -// LEWIDTHNUM-NEXT: 
[[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i16 +// LEWIDTHNUM-NEXT: store volatile i8 [[TMP0]], ptr [[S]], align 1 +// LEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i16 // LEWIDTHNUM-NEXT: ret void // // BEWIDTHNUM-LABEL: @increment_a_st15( @@ -3063,10 +2973,10 @@ struct st15 { // BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[S:%.*]], align 1 // BEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i16 // BEWIDTHNUM-NEXT: [[INC:%.*]] = add i16 [[BF_CAST]], 1 -// BEWIDTHNUM-NEXT: [[TMP1:%.*]] = trunc i16 [[INC]] to i8 +// BEWIDTHNUM-NEXT: [[TMP0:%.*]] = trunc i16 [[INC]] to i8 // BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[S]], align 1 -// BEWIDTHNUM-NEXT: store volatile i8 [[TMP1]], ptr [[S]], align 1 -// BEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i16 +// BEWIDTHNUM-NEXT: store volatile i8 [[TMP0]], ptr [[S]], align 1 +// BEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i16 // BEWIDTHNUM-NEXT: ret void // void increment_a_st15(volatile struct st15 *s) { @@ -3082,146 +2992,58 @@ struct st16 { // LE-LABEL: @increment_a_st16( // LE-NEXT: entry: -// LE-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[S:%.*]], align 4 -// LE-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// LE-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 32 -// LE-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// LE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LE-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[S]], align 4 -// LE-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// LE-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294967296 -// LE-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_VALUE]] -// LE-NEXT: store i64 [[BF_SET]], ptr [[S]], align 4 -// LE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// LE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// LE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LE-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[S:%.*]], align 4 +// LE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LE-NEXT: store i32 [[INC]], ptr [[S]], align 4 // LE-NEXT: ret void // // BE-LABEL: @increment_a_st16( // BE-NEXT: entry: -// BE-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[S:%.*]], align 4 -// BE-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_LOAD]], 32 -// BE-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// BE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BE-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[S]], align 4 -// BE-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// BE-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BE-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], 4294967295 -// BE-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL]] -// BE-NEXT: store i64 [[BF_SET]], ptr [[S]], align 4 -// BE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// BE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BE-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[S:%.*]], align 4 +// BE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BE-NEXT: store i32 [[INC]], ptr [[S]], align 4 // BE-NEXT: ret void // // LENUMLOADS-LABEL: @increment_a_st16( // LENUMLOADS-NEXT: entry: -// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[S:%.*]], align 4 -// LENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// LENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 32 -// 
LENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// LENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[S]], align 4 -// LENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// LENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294967296 -// LENUMLOADS-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_VALUE]] -// LENUMLOADS-NEXT: store i64 [[BF_SET]], ptr [[S]], align 4 -// LENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// LENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[S:%.*]], align 4 +// LENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LENUMLOADS-NEXT: store i32 [[INC]], ptr [[S]], align 4 // LENUMLOADS-NEXT: ret void // // BENUMLOADS-LABEL: @increment_a_st16( // BENUMLOADS-NEXT: entry: -// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[S:%.*]], align 4 -// BENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_LOAD]], 32 -// BENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// BENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[S]], align 4 -// BENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// BENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], 4294967295 -// BENUMLOADS-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL]] -// BENUMLOADS-NEXT: store i64 [[BF_SET]], ptr [[S]], align 4 -// BENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[S:%.*]], align 4 +// BENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BENUMLOADS-NEXT: store i32 [[INC]], ptr [[S]], align 4 // BENUMLOADS-NEXT: ret void // // LEWIDTH-LABEL: @increment_a_st16( // LEWIDTH-NEXT: entry: -// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[S:%.*]], align 4 -// LEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// LEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 32 -// LEWIDTH-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// LEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LEWIDTH-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[S]], align 4 -// LEWIDTH-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// LEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294967296 -// LEWIDTH-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_VALUE]] -// LEWIDTH-NEXT: store i64 [[BF_SET]], ptr [[S]], align 4 -// LEWIDTH-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// LEWIDTH-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// LEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[S:%.*]], align 4 +// LEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LEWIDTH-NEXT: store i32 [[INC]], ptr [[S]], align 4 // LEWIDTH-NEXT: ret void // // BEWIDTH-LABEL: @increment_a_st16( // BEWIDTH-NEXT: entry: -// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[S:%.*]], 
align 4 -// BEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_LOAD]], 32 -// BEWIDTH-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// BEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BEWIDTH-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[S]], align 4 -// BEWIDTH-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// BEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], 4294967295 -// BEWIDTH-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL]] -// BEWIDTH-NEXT: store i64 [[BF_SET]], ptr [[S]], align 4 -// BEWIDTH-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BEWIDTH-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// BEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[S:%.*]], align 4 +// BEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BEWIDTH-NEXT: store i32 [[INC]], ptr [[S]], align 4 // BEWIDTH-NEXT: ret void // // LEWIDTHNUM-LABEL: @increment_a_st16( // LEWIDTHNUM-NEXT: entry: -// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[S:%.*]], align 4 -// LEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// LEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 32 -// LEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// LEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LEWIDTHNUM-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[S]], align 4 -// LEWIDTHNUM-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// LEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294967296 -// LEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_VALUE]] -// LEWIDTHNUM-NEXT: store i64 [[BF_SET]], ptr [[S]], align 4 -// LEWIDTHNUM-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// LEWIDTHNUM-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// LEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[S:%.*]], align 4 +// LEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LEWIDTHNUM-NEXT: store i32 [[INC]], ptr [[S]], align 4 // LEWIDTHNUM-NEXT: ret void // // BEWIDTHNUM-LABEL: @increment_a_st16( // BEWIDTHNUM-NEXT: entry: -// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[S:%.*]], align 4 -// BEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_LOAD]], 32 -// BEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// BEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BEWIDTHNUM-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[S]], align 4 -// BEWIDTHNUM-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// BEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], 4294967295 -// BEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL]] -// BEWIDTHNUM-NEXT: store i64 [[BF_SET]], ptr [[S]], align 4 -// BEWIDTHNUM-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BEWIDTHNUM-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// BEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[S:%.*]], align 4 +// BEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BEWIDTHNUM-NEXT: store i32 [[INC]], ptr [[S]], align 4 // 
BEWIDTHNUM-NEXT: ret void // void increment_a_st16(struct st16 *s) { @@ -3230,154 +3052,90 @@ void increment_a_st16(struct st16 *s) { // LE-LABEL: @increment_b_st16( // LE-NEXT: entry: -// LE-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[S:%.*]], align 4 -// LE-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 16 -// LE-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// LE-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// LE-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// LE-NEXT: [[BF_LOAD:%.*]] = load i16, ptr [[B]], align 4 +// LE-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // LE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LE-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[S]], align 4 -// LE-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// LE-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 32 -// LE-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -281470681743361 -// LE-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// LE-NEXT: store i64 [[BF_SET]], ptr [[S]], align 4 -// LE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// LE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// LE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LE-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// LE-NEXT: store i16 [[TMP0]], ptr [[B]], align 4 +// LE-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // LE-NEXT: ret void // // BE-LABEL: @increment_b_st16( // BE-NEXT: entry: -// BE-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[S:%.*]], align 4 -// BE-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// BE-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// BE-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// BE-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// BE-NEXT: [[BF_LOAD:%.*]] = load i16, ptr [[B]], align 4 +// BE-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // BE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BE-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[S]], align 4 -// BE-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// BE-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 16 -// BE-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294901761 -// BE-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// BE-NEXT: store i64 [[BF_SET]], ptr [[S]], align 4 -// BE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// BE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// BE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BE-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// BE-NEXT: store i16 [[TMP0]], ptr [[B]], align 4 +// BE-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // BE-NEXT: ret void // // LENUMLOADS-LABEL: @increment_b_st16( // LENUMLOADS-NEXT: entry: -// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[S:%.*]], align 4 -// LENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 16 -// LENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// LENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// LENUMLOADS-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i16, ptr [[B]], align 4 +// LENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // LENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// 
LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[S]], align 4 -// LENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// LENUMLOADS-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 32 -// LENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -281470681743361 -// LENUMLOADS-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// LENUMLOADS-NEXT: store i64 [[BF_SET]], ptr [[S]], align 4 -// LENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// LENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LENUMLOADS-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// LENUMLOADS-NEXT: store i16 [[TMP0]], ptr [[B]], align 4 +// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // LENUMLOADS-NEXT: ret void // // BENUMLOADS-LABEL: @increment_b_st16( // BENUMLOADS-NEXT: entry: -// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[S:%.*]], align 4 -// BENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// BENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// BENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// BENUMLOADS-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i16, ptr [[B]], align 4 +// BENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // BENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[S]], align 4 -// BENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// BENUMLOADS-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 16 -// BENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294901761 -// BENUMLOADS-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// BENUMLOADS-NEXT: store i64 [[BF_SET]], ptr [[S]], align 4 -// BENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// BENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BENUMLOADS-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// BENUMLOADS-NEXT: store i16 [[TMP0]], ptr [[B]], align 4 +// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // BENUMLOADS-NEXT: ret void // // LEWIDTH-LABEL: @increment_b_st16( // LEWIDTH-NEXT: entry: -// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[S:%.*]], align 4 -// LEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 16 -// LEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// LEWIDTH-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// LEWIDTH-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i16, ptr [[B]], align 4 +// LEWIDTH-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // LEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LEWIDTH-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[S]], align 4 -// LEWIDTH-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// LEWIDTH-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 32 -// LEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -281470681743361 -// LEWIDTH-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// LEWIDTH-NEXT: store i64 [[BF_SET]], ptr [[S]], align 4 -// LEWIDTH-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// LEWIDTH-NEXT: 
[[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// LEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LEWIDTH-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// LEWIDTH-NEXT: store i16 [[TMP0]], ptr [[B]], align 4 +// LEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // LEWIDTH-NEXT: ret void // // BEWIDTH-LABEL: @increment_b_st16( // BEWIDTH-NEXT: entry: -// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[S:%.*]], align 4 -// BEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// BEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// BEWIDTH-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// BEWIDTH-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i16, ptr [[B]], align 4 +// BEWIDTH-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // BEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BEWIDTH-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[S]], align 4 -// BEWIDTH-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// BEWIDTH-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 16 -// BEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294901761 -// BEWIDTH-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// BEWIDTH-NEXT: store i64 [[BF_SET]], ptr [[S]], align 4 -// BEWIDTH-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// BEWIDTH-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// BEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BEWIDTH-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// BEWIDTH-NEXT: store i16 [[TMP0]], ptr [[B]], align 4 +// BEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // BEWIDTH-NEXT: ret void // // LEWIDTHNUM-LABEL: @increment_b_st16( // LEWIDTHNUM-NEXT: entry: -// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[S:%.*]], align 4 -// LEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 16 -// LEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// LEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// LEWIDTHNUM-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i16, ptr [[B]], align 4 +// LEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // LEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LEWIDTHNUM-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[S]], align 4 -// LEWIDTHNUM-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// LEWIDTHNUM-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 32 -// LEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -281470681743361 -// LEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// LEWIDTHNUM-NEXT: store i64 [[BF_SET]], ptr [[S]], align 4 -// LEWIDTHNUM-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// LEWIDTHNUM-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// LEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LEWIDTHNUM-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// LEWIDTHNUM-NEXT: store i16 [[TMP0]], ptr [[B]], align 4 +// LEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // LEWIDTHNUM-NEXT: ret void // // BEWIDTHNUM-LABEL: @increment_b_st16( // BEWIDTHNUM-NEXT: entry: -// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[S:%.*]], align 4 -// BEWIDTHNUM-NEXT: [[BF_SHL:%.*]] 
= shl i64 [[BF_LOAD]], 32 -// BEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// BEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// BEWIDTHNUM-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i16, ptr [[B]], align 4 +// BEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // BEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BEWIDTHNUM-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[S]], align 4 -// BEWIDTHNUM-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// BEWIDTHNUM-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 16 -// BEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294901761 -// BEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// BEWIDTHNUM-NEXT: store i64 [[BF_SET]], ptr [[S]], align 4 -// BEWIDTHNUM-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// BEWIDTHNUM-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// BEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BEWIDTHNUM-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// BEWIDTHNUM-NEXT: store i16 [[TMP0]], ptr [[B]], align 4 +// BEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // BEWIDTHNUM-NEXT: ret void // void increment_b_st16(struct st16 *s) { @@ -3386,154 +3144,66 @@ void increment_b_st16(struct st16 *s) { // LE-LABEL: @increment_c_st16( // LE-NEXT: entry: -// LE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// LE-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[C]], align 4 -// LE-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// LE-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 32 -// LE-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// LE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LE-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[C]], align 4 -// LE-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// LE-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294967296 -// LE-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_VALUE]] -// LE-NEXT: store i64 [[BF_SET]], ptr [[C]], align 4 -// LE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// LE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// LE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 2 +// LE-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[C]], align 4 +// LE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LE-NEXT: store i32 [[INC]], ptr [[C]], align 4 // LE-NEXT: ret void // // BE-LABEL: @increment_c_st16( // BE-NEXT: entry: -// BE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// BE-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[C]], align 4 -// BE-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_LOAD]], 32 -// BE-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// BE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BE-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[C]], align 4 -// BE-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// BE-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BE-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], 4294967295 -// BE-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL]] -// BE-NEXT: store i64 [[BF_SET]], ptr [[C]], align 4 -// 
BE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// BE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 2 +// BE-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[C]], align 4 +// BE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BE-NEXT: store i32 [[INC]], ptr [[C]], align 4 // BE-NEXT: ret void // // LENUMLOADS-LABEL: @increment_c_st16( // LENUMLOADS-NEXT: entry: -// LENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[C]], align 4 -// LENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// LENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 32 -// LENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// LENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[C]], align 4 -// LENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// LENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294967296 -// LENUMLOADS-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_VALUE]] -// LENUMLOADS-NEXT: store i64 [[BF_SET]], ptr [[C]], align 4 -// LENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// LENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 2 +// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[C]], align 4 +// LENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LENUMLOADS-NEXT: store i32 [[INC]], ptr [[C]], align 4 // LENUMLOADS-NEXT: ret void // // BENUMLOADS-LABEL: @increment_c_st16( // BENUMLOADS-NEXT: entry: -// BENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[C]], align 4 -// BENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_LOAD]], 32 -// BENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// BENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[C]], align 4 -// BENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// BENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], 4294967295 -// BENUMLOADS-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL]] -// BENUMLOADS-NEXT: store i64 [[BF_SET]], ptr [[C]], align 4 -// BENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 2 +// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[C]], align 4 +// BENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BENUMLOADS-NEXT: store i32 [[INC]], ptr [[C]], align 4 // BENUMLOADS-NEXT: ret void // // LEWIDTH-LABEL: @increment_c_st16( // LEWIDTH-NEXT: entry: -// LEWIDTH-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, 
i32 1 -// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[C]], align 4 -// LEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// LEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 32 -// LEWIDTH-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// LEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LEWIDTH-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[C]], align 4 -// LEWIDTH-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// LEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294967296 -// LEWIDTH-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_VALUE]] -// LEWIDTH-NEXT: store i64 [[BF_SET]], ptr [[C]], align 4 -// LEWIDTH-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// LEWIDTH-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// LEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LEWIDTH-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 2 +// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[C]], align 4 +// LEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LEWIDTH-NEXT: store i32 [[INC]], ptr [[C]], align 4 // LEWIDTH-NEXT: ret void // // BEWIDTH-LABEL: @increment_c_st16( // BEWIDTH-NEXT: entry: -// BEWIDTH-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[C]], align 4 -// BEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_LOAD]], 32 -// BEWIDTH-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// BEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BEWIDTH-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[C]], align 4 -// BEWIDTH-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// BEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], 4294967295 -// BEWIDTH-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL]] -// BEWIDTH-NEXT: store i64 [[BF_SET]], ptr [[C]], align 4 -// BEWIDTH-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BEWIDTH-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// BEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BEWIDTH-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 2 +// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[C]], align 4 +// BEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BEWIDTH-NEXT: store i32 [[INC]], ptr [[C]], align 4 // BEWIDTH-NEXT: ret void // // LEWIDTHNUM-LABEL: @increment_c_st16( // LEWIDTHNUM-NEXT: entry: -// LEWIDTHNUM-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[C]], align 4 -// LEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// LEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 32 -// LEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// LEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LEWIDTHNUM-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[C]], align 4 -// LEWIDTHNUM-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// LEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294967296 -// LEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_VALUE]] -// LEWIDTHNUM-NEXT: store i64 [[BF_SET]], ptr [[C]], align 4 -// LEWIDTHNUM-NEXT: 
[[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// LEWIDTHNUM-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// LEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LEWIDTHNUM-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 2 +// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[C]], align 4 +// LEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LEWIDTHNUM-NEXT: store i32 [[INC]], ptr [[C]], align 4 // LEWIDTHNUM-NEXT: ret void // // BEWIDTHNUM-LABEL: @increment_c_st16( // BEWIDTHNUM-NEXT: entry: -// BEWIDTHNUM-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[C]], align 4 -// BEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_LOAD]], 32 -// BEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// BEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BEWIDTHNUM-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[C]], align 4 -// BEWIDTHNUM-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// BEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], 4294967295 -// BEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL]] -// BEWIDTHNUM-NEXT: store i64 [[BF_SET]], ptr [[C]], align 4 -// BEWIDTHNUM-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BEWIDTHNUM-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// BEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BEWIDTHNUM-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 2 +// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[C]], align 4 +// BEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BEWIDTHNUM-NEXT: store i32 [[INC]], ptr [[C]], align 4 // BEWIDTHNUM-NEXT: ret void // void increment_c_st16(struct st16 *s) { @@ -3542,162 +3212,90 @@ void increment_c_st16(struct st16 *s) { // LE-LABEL: @increment_d_st16( // LE-NEXT: entry: -// LE-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// LE-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[D]], align 4 -// LE-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 16 -// LE-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// LE-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// LE-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 3 +// LE-NEXT: [[BF_LOAD:%.*]] = load i16, ptr [[D]], align 4 +// LE-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // LE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LE-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[D]], align 4 -// LE-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// LE-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 32 -// LE-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -281470681743361 -// LE-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// LE-NEXT: store i64 [[BF_SET]], ptr [[D]], align 4 -// LE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// LE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// LE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LE-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// LE-NEXT: store i16 [[TMP0]], ptr [[D]], align 4 +// LE-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // LE-NEXT: ret void // // 
BE-LABEL: @increment_d_st16( // BE-NEXT: entry: -// BE-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// BE-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[D]], align 4 -// BE-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// BE-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// BE-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// BE-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 3 +// BE-NEXT: [[BF_LOAD:%.*]] = load i16, ptr [[D]], align 4 +// BE-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // BE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BE-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[D]], align 4 -// BE-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// BE-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 16 -// BE-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294901761 -// BE-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// BE-NEXT: store i64 [[BF_SET]], ptr [[D]], align 4 -// BE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// BE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// BE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BE-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// BE-NEXT: store i16 [[TMP0]], ptr [[D]], align 4 +// BE-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // BE-NEXT: ret void // // LENUMLOADS-LABEL: @increment_d_st16( // LENUMLOADS-NEXT: entry: -// LENUMLOADS-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[D]], align 4 -// LENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 16 -// LENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// LENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// LENUMLOADS-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 3 +// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i16, ptr [[D]], align 4 +// LENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // LENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[D]], align 4 -// LENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// LENUMLOADS-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 32 -// LENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -281470681743361 -// LENUMLOADS-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// LENUMLOADS-NEXT: store i64 [[BF_SET]], ptr [[D]], align 4 -// LENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// LENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LENUMLOADS-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// LENUMLOADS-NEXT: store i16 [[TMP0]], ptr [[D]], align 4 +// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // LENUMLOADS-NEXT: ret void // // BENUMLOADS-LABEL: @increment_d_st16( // BENUMLOADS-NEXT: entry: -// BENUMLOADS-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[D]], align 4 -// BENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// BENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// BENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// 
BENUMLOADS-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 3 +// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load i16, ptr [[D]], align 4 +// BENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // BENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[D]], align 4 -// BENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// BENUMLOADS-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 16 -// BENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294901761 -// BENUMLOADS-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// BENUMLOADS-NEXT: store i64 [[BF_SET]], ptr [[D]], align 4 -// BENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// BENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BENUMLOADS-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// BENUMLOADS-NEXT: store i16 [[TMP0]], ptr [[D]], align 4 +// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // BENUMLOADS-NEXT: ret void // // LEWIDTH-LABEL: @increment_d_st16( // LEWIDTH-NEXT: entry: -// LEWIDTH-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[D]], align 4 -// LEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 16 -// LEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// LEWIDTH-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// LEWIDTH-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 3 +// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i16, ptr [[D]], align 4 +// LEWIDTH-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // LEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LEWIDTH-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[D]], align 4 -// LEWIDTH-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// LEWIDTH-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 32 -// LEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -281470681743361 -// LEWIDTH-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// LEWIDTH-NEXT: store i64 [[BF_SET]], ptr [[D]], align 4 -// LEWIDTH-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// LEWIDTH-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// LEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LEWIDTH-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// LEWIDTH-NEXT: store i16 [[TMP0]], ptr [[D]], align 4 +// LEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // LEWIDTH-NEXT: ret void // // BEWIDTH-LABEL: @increment_d_st16( // BEWIDTH-NEXT: entry: -// BEWIDTH-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[D]], align 4 -// BEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// BEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// BEWIDTH-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// BEWIDTH-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 3 +// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load i16, ptr [[D]], align 4 +// BEWIDTH-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // BEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BEWIDTH-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 
-// BEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[D]], align 4 -// BEWIDTH-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// BEWIDTH-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 16 -// BEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294901761 -// BEWIDTH-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// BEWIDTH-NEXT: store i64 [[BF_SET]], ptr [[D]], align 4 -// BEWIDTH-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// BEWIDTH-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// BEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BEWIDTH-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// BEWIDTH-NEXT: store i16 [[TMP0]], ptr [[D]], align 4 +// BEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // BEWIDTH-NEXT: ret void // // LEWIDTHNUM-LABEL: @increment_d_st16( // LEWIDTHNUM-NEXT: entry: -// LEWIDTHNUM-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[D]], align 4 -// LEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 16 -// LEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// LEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// LEWIDTHNUM-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 3 +// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i16, ptr [[D]], align 4 +// LEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // LEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LEWIDTHNUM-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[D]], align 4 -// LEWIDTHNUM-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// LEWIDTHNUM-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 32 -// LEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -281470681743361 -// LEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// LEWIDTHNUM-NEXT: store i64 [[BF_SET]], ptr [[D]], align 4 -// LEWIDTHNUM-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// LEWIDTHNUM-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// LEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LEWIDTHNUM-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// LEWIDTHNUM-NEXT: store i16 [[TMP0]], ptr [[D]], align 4 +// LEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // LEWIDTHNUM-NEXT: ret void // // BEWIDTHNUM-LABEL: @increment_d_st16( // BEWIDTHNUM-NEXT: entry: -// BEWIDTHNUM-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i64, ptr [[D]], align 4 -// BEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// BEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// BEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// BEWIDTHNUM-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 3 +// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load i16, ptr [[D]], align 4 +// BEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // BEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BEWIDTHNUM-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load i64, ptr [[D]], align 4 -// BEWIDTHNUM-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// BEWIDTHNUM-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 16 -// BEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294901761 -// 
BEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// BEWIDTHNUM-NEXT: store i64 [[BF_SET]], ptr [[D]], align 4 -// BEWIDTHNUM-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// BEWIDTHNUM-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// BEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BEWIDTHNUM-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// BEWIDTHNUM-NEXT: store i16 [[TMP0]], ptr [[D]], align 4 +// BEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // BEWIDTHNUM-NEXT: ret void // void increment_d_st16(struct st16 *s) { @@ -3706,74 +3304,32 @@ void increment_d_st16(struct st16 *s) { // LE-LABEL: @increment_v_a_st16( // LE-NEXT: entry: -// LE-NEXT: [[BF_LOAD:%.*]] = load volatile i64, ptr [[S:%.*]], align 4 -// LE-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// LE-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 32 -// LE-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// LE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LE-NEXT: [[BF_LOAD1:%.*]] = load volatile i64, ptr [[S]], align 4 -// LE-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// LE-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294967296 -// LE-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_VALUE]] -// LE-NEXT: store volatile i64 [[BF_SET]], ptr [[S]], align 4 -// LE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// LE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// LE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LE-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[S:%.*]], align 4 +// LE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LE-NEXT: store volatile i32 [[INC]], ptr [[S]], align 4 // LE-NEXT: ret void // // BE-LABEL: @increment_v_a_st16( // BE-NEXT: entry: -// BE-NEXT: [[BF_LOAD:%.*]] = load volatile i64, ptr [[S:%.*]], align 4 -// BE-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_LOAD]], 32 -// BE-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// BE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BE-NEXT: [[BF_LOAD1:%.*]] = load volatile i64, ptr [[S]], align 4 -// BE-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// BE-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BE-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], 4294967295 -// BE-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL]] -// BE-NEXT: store volatile i64 [[BF_SET]], ptr [[S]], align 4 -// BE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// BE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BE-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[S:%.*]], align 4 +// BE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BE-NEXT: store volatile i32 [[INC]], ptr [[S]], align 4 // BE-NEXT: ret void // // LENUMLOADS-LABEL: @increment_v_a_st16( // LENUMLOADS-NEXT: entry: -// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i64, ptr [[S:%.*]], align 4 -// LENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// LENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 32 -// LENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// LENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i64, ptr [[S]], align 4 -// LENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and 
i64 [[TMP1]], 4294967295 -// LENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294967296 -// LENUMLOADS-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_VALUE]] -// LENUMLOADS-NEXT: store volatile i64 [[BF_SET]], ptr [[S]], align 4 -// LENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// LENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[S:%.*]], align 4 +// LENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[S]], align 4 +// LENUMLOADS-NEXT: store volatile i32 [[INC]], ptr [[S]], align 4 // LENUMLOADS-NEXT: ret void // // BENUMLOADS-LABEL: @increment_v_a_st16( // BENUMLOADS-NEXT: entry: -// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i64, ptr [[S:%.*]], align 4 -// BENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_LOAD]], 32 -// BENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// BENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i64, ptr [[S]], align 4 -// BENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// BENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], 4294967295 -// BENUMLOADS-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL]] -// BENUMLOADS-NEXT: store volatile i64 [[BF_SET]], ptr [[S]], align 4 -// BENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[S:%.*]], align 4 +// BENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[S]], align 4 +// BENUMLOADS-NEXT: store volatile i32 [[INC]], ptr [[S]], align 4 // BENUMLOADS-NEXT: ret void // // LEWIDTH-LABEL: @increment_v_a_st16( @@ -3812,140 +3368,110 @@ void increment_v_a_st16(volatile struct st16 *s) { // LE-LABEL: @increment_v_b_st16( // LE-NEXT: entry: -// LE-NEXT: [[BF_LOAD:%.*]] = load volatile i64, ptr [[S:%.*]], align 4 -// LE-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 16 -// LE-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// LE-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// LE-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// LE-NEXT: [[BF_LOAD:%.*]] = load volatile i16, ptr [[B]], align 4 +// LE-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // LE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LE-NEXT: [[BF_LOAD1:%.*]] = load volatile i64, ptr [[S]], align 4 -// LE-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// LE-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 32 -// LE-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -281470681743361 -// LE-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// LE-NEXT: store volatile i64 [[BF_SET]], ptr [[S]], align 4 -// LE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// LE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// LE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LE-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// 
LE-NEXT: store volatile i16 [[TMP0]], ptr [[B]], align 4 +// LE-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // LE-NEXT: ret void // // BE-LABEL: @increment_v_b_st16( // BE-NEXT: entry: -// BE-NEXT: [[BF_LOAD:%.*]] = load volatile i64, ptr [[S:%.*]], align 4 -// BE-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// BE-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// BE-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// BE-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// BE-NEXT: [[BF_LOAD:%.*]] = load volatile i16, ptr [[B]], align 4 +// BE-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // BE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BE-NEXT: [[BF_LOAD1:%.*]] = load volatile i64, ptr [[S]], align 4 -// BE-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// BE-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 16 -// BE-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294901761 -// BE-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// BE-NEXT: store volatile i64 [[BF_SET]], ptr [[S]], align 4 -// BE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// BE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// BE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BE-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// BE-NEXT: store volatile i16 [[TMP0]], ptr [[B]], align 4 +// BE-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // BE-NEXT: ret void // // LENUMLOADS-LABEL: @increment_v_b_st16( // LENUMLOADS-NEXT: entry: -// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i64, ptr [[S:%.*]], align 4 -// LENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 16 -// LENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// LENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// LENUMLOADS-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i16, ptr [[B]], align 4 +// LENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // LENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i64, ptr [[S]], align 4 -// LENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// LENUMLOADS-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 32 -// LENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -281470681743361 -// LENUMLOADS-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// LENUMLOADS-NEXT: store volatile i64 [[BF_SET]], ptr [[S]], align 4 -// LENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// LENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LENUMLOADS-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i16, ptr [[B]], align 4 +// LENUMLOADS-NEXT: store volatile i16 [[TMP0]], ptr [[B]], align 4 +// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // LENUMLOADS-NEXT: ret void // // BENUMLOADS-LABEL: @increment_v_b_st16( // BENUMLOADS-NEXT: entry: -// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i64, ptr [[S:%.*]], align 4 -// BENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// BENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// BENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i64 
[[BF_ASHR]] to i32 +// BENUMLOADS-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i16, ptr [[B]], align 4 +// BENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // BENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i64, ptr [[S]], align 4 -// BENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// BENUMLOADS-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 16 -// BENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294901761 -// BENUMLOADS-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// BENUMLOADS-NEXT: store volatile i64 [[BF_SET]], ptr [[S]], align 4 -// BENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// BENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BENUMLOADS-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i16, ptr [[B]], align 4 +// BENUMLOADS-NEXT: store volatile i16 [[TMP0]], ptr [[B]], align 4 +// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // BENUMLOADS-NEXT: ret void // // LEWIDTH-LABEL: @increment_v_b_st16( // LEWIDTH-NEXT: entry: -// LEWIDTH-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 1 -// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// LEWIDTH-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 1 +// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP0]], align 4 // LEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_LOAD]], 16 // LEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_SHL]], 16 // LEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_ASHR]], 1 -// LEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// LEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[TMP0]], align 4 // LEWIDTH-NEXT: [[BF_VALUE:%.*]] = and i32 [[INC]], 65535 // LEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD1]], -65536 // LEWIDTH-NEXT: [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], [[BF_VALUE]] -// LEWIDTH-NEXT: store volatile i32 [[BF_SET]], ptr [[TMP1]], align 4 +// LEWIDTH-NEXT: store volatile i32 [[BF_SET]], ptr [[TMP0]], align 4 // LEWIDTH-NEXT: [[BF_RESULT_SHL:%.*]] = shl i32 [[BF_VALUE]], 16 // LEWIDTH-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i32 [[BF_RESULT_SHL]], 16 // LEWIDTH-NEXT: ret void // // BEWIDTH-LABEL: @increment_v_b_st16( // BEWIDTH-NEXT: entry: -// BEWIDTH-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 1 -// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// BEWIDTH-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 1 +// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP0]], align 4 // BEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_LOAD]], 16 // BEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_ASHR]], 1 -// BEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// BEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[TMP0]], align 4 // BEWIDTH-NEXT: [[BF_VALUE:%.*]] = and i32 [[INC]], 65535 // BEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_VALUE]], 16 // BEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD1]], 65535 // BEWIDTH-NEXT: [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], [[BF_SHL]] -// BEWIDTH-NEXT: store volatile i32 [[BF_SET]], ptr [[TMP1]], align 4 +// 
BEWIDTH-NEXT: store volatile i32 [[BF_SET]], ptr [[TMP0]], align 4 // BEWIDTH-NEXT: [[BF_RESULT_SHL:%.*]] = shl i32 [[BF_VALUE]], 16 // BEWIDTH-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i32 [[BF_RESULT_SHL]], 16 // BEWIDTH-NEXT: ret void // // LEWIDTHNUM-LABEL: @increment_v_b_st16( // LEWIDTHNUM-NEXT: entry: -// LEWIDTHNUM-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 1 -// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// LEWIDTHNUM-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 1 +// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP0]], align 4 // LEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_LOAD]], 16 // LEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_SHL]], 16 // LEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_ASHR]], 1 -// LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[TMP0]], align 4 // LEWIDTHNUM-NEXT: [[BF_VALUE:%.*]] = and i32 [[INC]], 65535 // LEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD1]], -65536 // LEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], [[BF_VALUE]] -// LEWIDTHNUM-NEXT: store volatile i32 [[BF_SET]], ptr [[TMP1]], align 4 +// LEWIDTHNUM-NEXT: store volatile i32 [[BF_SET]], ptr [[TMP0]], align 4 // LEWIDTHNUM-NEXT: [[BF_RESULT_SHL:%.*]] = shl i32 [[BF_VALUE]], 16 // LEWIDTHNUM-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i32 [[BF_RESULT_SHL]], 16 // LEWIDTHNUM-NEXT: ret void // // BEWIDTHNUM-LABEL: @increment_v_b_st16( // BEWIDTHNUM-NEXT: entry: -// BEWIDTHNUM-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 1 -// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// BEWIDTHNUM-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 1 +// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP0]], align 4 // BEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_LOAD]], 16 // BEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_ASHR]], 1 -// BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[TMP0]], align 4 // BEWIDTHNUM-NEXT: [[BF_VALUE:%.*]] = and i32 [[INC]], 65535 // BEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_VALUE]], 16 // BEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD1]], 65535 // BEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], [[BF_SHL]] -// BEWIDTHNUM-NEXT: store volatile i32 [[BF_SET]], ptr [[TMP1]], align 4 +// BEWIDTHNUM-NEXT: store volatile i32 [[BF_SET]], ptr [[TMP0]], align 4 // BEWIDTHNUM-NEXT: [[BF_RESULT_SHL:%.*]] = shl i32 [[BF_VALUE]], 16 // BEWIDTHNUM-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i32 [[BF_RESULT_SHL]], 16 // BEWIDTHNUM-NEXT: ret void @@ -3956,112 +3482,70 @@ void increment_v_b_st16(volatile struct st16 *s) { // LE-LABEL: @increment_v_c_st16( // LE-NEXT: entry: -// LE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// LE-NEXT: [[BF_LOAD:%.*]] = load volatile i64, ptr [[C]], align 4 -// LE-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// LE-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 32 -// LE-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// LE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LE-NEXT: [[BF_LOAD1:%.*]] = load volatile i64, ptr [[C]], align 4 -// LE-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// LE-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294967296 -// LE-NEXT: 
[[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_VALUE]] -// LE-NEXT: store volatile i64 [[BF_SET]], ptr [[C]], align 4 -// LE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// LE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// LE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 2 +// LE-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[C]], align 4 +// LE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LE-NEXT: store volatile i32 [[INC]], ptr [[C]], align 4 // LE-NEXT: ret void // // BE-LABEL: @increment_v_c_st16( // BE-NEXT: entry: -// BE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// BE-NEXT: [[BF_LOAD:%.*]] = load volatile i64, ptr [[C]], align 4 -// BE-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_LOAD]], 32 -// BE-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// BE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BE-NEXT: [[BF_LOAD1:%.*]] = load volatile i64, ptr [[C]], align 4 -// BE-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// BE-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BE-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], 4294967295 -// BE-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL]] -// BE-NEXT: store volatile i64 [[BF_SET]], ptr [[C]], align 4 -// BE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// BE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 2 +// BE-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[C]], align 4 +// BE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BE-NEXT: store volatile i32 [[INC]], ptr [[C]], align 4 // BE-NEXT: ret void // // LENUMLOADS-LABEL: @increment_v_c_st16( // LENUMLOADS-NEXT: entry: -// LENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i64, ptr [[C]], align 4 -// LENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// LENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 32 -// LENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// LENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i64, ptr [[C]], align 4 -// LENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// LENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294967296 -// LENUMLOADS-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_VALUE]] -// LENUMLOADS-NEXT: store volatile i64 [[BF_SET]], ptr [[C]], align 4 -// LENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// LENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 2 +// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[C]], align 4 +// LENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[C]], align 4 +// LENUMLOADS-NEXT: store volatile i32 [[INC]], ptr [[C]], align 4 // LENUMLOADS-NEXT: ret void // // BENUMLOADS-LABEL: 
@increment_v_c_st16( // BENUMLOADS-NEXT: entry: -// BENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i64, ptr [[C]], align 4 -// BENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_LOAD]], 32 -// BENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 -// BENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i64, ptr [[C]], align 4 -// BENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 4294967295 -// BENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], 4294967295 -// BENUMLOADS-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL]] -// BENUMLOADS-NEXT: store volatile i64 [[BF_SET]], ptr [[C]], align 4 -// BENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 32 -// BENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 32 -// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 2 +// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[C]], align 4 +// BENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[C]], align 4 +// BENUMLOADS-NEXT: store volatile i32 [[INC]], ptr [[C]], align 4 // BENUMLOADS-NEXT: ret void // // LEWIDTH-LABEL: @increment_v_c_st16( // LEWIDTH-NEXT: entry: -// LEWIDTH-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 2 -// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// LEWIDTH-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 2 +// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[C]], align 4 // LEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 -// LEWIDTH-NEXT: store volatile i32 [[INC]], ptr [[TMP1]], align 4 +// LEWIDTH-NEXT: store volatile i32 [[INC]], ptr [[C]], align 4 // LEWIDTH-NEXT: ret void // // BEWIDTH-LABEL: @increment_v_c_st16( // BEWIDTH-NEXT: entry: -// BEWIDTH-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 2 -// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// BEWIDTH-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 2 +// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[C]], align 4 // BEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 -// BEWIDTH-NEXT: store volatile i32 [[INC]], ptr [[TMP1]], align 4 +// BEWIDTH-NEXT: store volatile i32 [[INC]], ptr [[C]], align 4 // BEWIDTH-NEXT: ret void // // LEWIDTHNUM-LABEL: @increment_v_c_st16( // LEWIDTHNUM-NEXT: entry: -// LEWIDTHNUM-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 2 -// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// LEWIDTHNUM-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 2 +// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[C]], align 4 // LEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 -// LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[TMP1]], align 4 -// LEWIDTHNUM-NEXT: store volatile i32 [[INC]], ptr [[TMP1]], align 4 +// LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[C]], align 4 +// LEWIDTHNUM-NEXT: store volatile i32 [[INC]], ptr 
[[C]], align 4 // LEWIDTHNUM-NEXT: ret void // // BEWIDTHNUM-LABEL: @increment_v_c_st16( // BEWIDTHNUM-NEXT: entry: -// BEWIDTHNUM-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 2 -// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// BEWIDTHNUM-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 2 +// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[C]], align 4 // BEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 -// BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[TMP1]], align 4 -// BEWIDTHNUM-NEXT: store volatile i32 [[INC]], ptr [[TMP1]], align 4 +// BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[C]], align 4 +// BEWIDTHNUM-NEXT: store volatile i32 [[INC]], ptr [[C]], align 4 // BEWIDTHNUM-NEXT: ret void // void increment_v_c_st16(volatile struct st16 *s) { @@ -4070,144 +3554,110 @@ void increment_v_c_st16(volatile struct st16 *s) { // LE-LABEL: @increment_v_d_st16( // LE-NEXT: entry: -// LE-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// LE-NEXT: [[BF_LOAD:%.*]] = load volatile i64, ptr [[D]], align 4 -// LE-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 16 -// LE-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// LE-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// LE-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 3 +// LE-NEXT: [[BF_LOAD:%.*]] = load volatile i16, ptr [[D]], align 4 +// LE-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // LE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LE-NEXT: [[BF_LOAD1:%.*]] = load volatile i64, ptr [[D]], align 4 -// LE-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// LE-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 32 -// LE-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -281470681743361 -// LE-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// LE-NEXT: store volatile i64 [[BF_SET]], ptr [[D]], align 4 -// LE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// LE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// LE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LE-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// LE-NEXT: store volatile i16 [[TMP0]], ptr [[D]], align 4 +// LE-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // LE-NEXT: ret void // // BE-LABEL: @increment_v_d_st16( // BE-NEXT: entry: -// BE-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// BE-NEXT: [[BF_LOAD:%.*]] = load volatile i64, ptr [[D]], align 4 -// BE-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// BE-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// BE-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// BE-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 3 +// BE-NEXT: [[BF_LOAD:%.*]] = load volatile i16, ptr [[D]], align 4 +// BE-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // BE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BE-NEXT: [[BF_LOAD1:%.*]] = load volatile i64, ptr [[D]], align 4 -// BE-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// BE-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 16 -// BE-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294901761 -// BE-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// BE-NEXT: store volatile i64 
[[BF_SET]], ptr [[D]], align 4 -// BE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// BE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// BE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BE-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// BE-NEXT: store volatile i16 [[TMP0]], ptr [[D]], align 4 +// BE-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // BE-NEXT: ret void // // LENUMLOADS-LABEL: @increment_v_d_st16( // LENUMLOADS-NEXT: entry: -// LENUMLOADS-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i64, ptr [[D]], align 4 -// LENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 16 -// LENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// LENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// LENUMLOADS-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 3 +// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i16, ptr [[D]], align 4 +// LENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // LENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i64, ptr [[D]], align 4 -// LENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// LENUMLOADS-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 32 -// LENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -281470681743361 -// LENUMLOADS-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// LENUMLOADS-NEXT: store volatile i64 [[BF_SET]], ptr [[D]], align 4 -// LENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// LENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// LENUMLOADS-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i16, ptr [[D]], align 4 +// LENUMLOADS-NEXT: store volatile i16 [[TMP0]], ptr [[D]], align 4 +// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // LENUMLOADS-NEXT: ret void // // BENUMLOADS-LABEL: @increment_v_d_st16( // BENUMLOADS-NEXT: entry: -// BENUMLOADS-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 1 -// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i64, ptr [[D]], align 4 -// BENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i64 [[BF_LOAD]], 32 -// BENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i64 [[BF_SHL]], 48 -// BENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i64 [[BF_ASHR]] to i32 +// BENUMLOADS-NEXT: [[D:%.*]] = getelementptr inbounds [[STRUCT_ST16:%.*]], ptr [[S:%.*]], i32 0, i32 3 +// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i16, ptr [[D]], align 4 +// BENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i16 [[BF_LOAD]] to i32 // BENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64 -// BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i64, ptr [[D]], align 4 -// BENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i64 [[TMP1]], 65535 -// BENUMLOADS-NEXT: [[BF_SHL2:%.*]] = shl i64 [[BF_VALUE]], 16 -// BENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i64 [[BF_LOAD1]], -4294901761 -// BENUMLOADS-NEXT: [[BF_SET:%.*]] = or i64 [[BF_CLEAR]], [[BF_SHL2]] -// BENUMLOADS-NEXT: store volatile i64 [[BF_SET]], ptr [[D]], align 4 -// BENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i64 [[BF_VALUE]], 48 -// BENUMLOADS-NEXT: 
[[BF_RESULT_ASHR:%.*]] = ashr i64 [[BF_RESULT_SHL]], 48 -// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i64 [[BF_RESULT_ASHR]] to i32 +// BENUMLOADS-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i16 +// BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i16, ptr [[D]], align 4 +// BENUMLOADS-NEXT: store volatile i16 [[TMP0]], ptr [[D]], align 4 +// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i16 [[TMP0]] to i32 // BENUMLOADS-NEXT: ret void // // LEWIDTH-LABEL: @increment_v_d_st16( // LEWIDTH-NEXT: entry: -// LEWIDTH-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 3 -// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// LEWIDTH-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 3 +// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP0]], align 4 // LEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_LOAD]], 16 // LEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_SHL]], 16 // LEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_ASHR]], 1 -// LEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// LEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[TMP0]], align 4 // LEWIDTH-NEXT: [[BF_VALUE:%.*]] = and i32 [[INC]], 65535 // LEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD1]], -65536 // LEWIDTH-NEXT: [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], [[BF_VALUE]] -// LEWIDTH-NEXT: store volatile i32 [[BF_SET]], ptr [[TMP1]], align 4 +// LEWIDTH-NEXT: store volatile i32 [[BF_SET]], ptr [[TMP0]], align 4 // LEWIDTH-NEXT: [[BF_RESULT_SHL:%.*]] = shl i32 [[BF_VALUE]], 16 // LEWIDTH-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i32 [[BF_RESULT_SHL]], 16 // LEWIDTH-NEXT: ret void // // BEWIDTH-LABEL: @increment_v_d_st16( // BEWIDTH-NEXT: entry: -// BEWIDTH-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 3 -// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// BEWIDTH-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 3 +// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP0]], align 4 // BEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_LOAD]], 16 // BEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_ASHR]], 1 -// BEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// BEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[TMP0]], align 4 // BEWIDTH-NEXT: [[BF_VALUE:%.*]] = and i32 [[INC]], 65535 // BEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_VALUE]], 16 // BEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD1]], 65535 // BEWIDTH-NEXT: [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], [[BF_SHL]] -// BEWIDTH-NEXT: store volatile i32 [[BF_SET]], ptr [[TMP1]], align 4 +// BEWIDTH-NEXT: store volatile i32 [[BF_SET]], ptr [[TMP0]], align 4 // BEWIDTH-NEXT: [[BF_RESULT_SHL:%.*]] = shl i32 [[BF_VALUE]], 16 // BEWIDTH-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i32 [[BF_RESULT_SHL]], 16 // BEWIDTH-NEXT: ret void // // LEWIDTHNUM-LABEL: @increment_v_d_st16( // LEWIDTHNUM-NEXT: entry: -// LEWIDTHNUM-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 3 -// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// LEWIDTHNUM-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 3 +// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP0]], align 4 // LEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_LOAD]], 16 // LEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_SHL]], 16 // LEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_ASHR]], 1 -// LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// 
LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[TMP0]], align 4 // LEWIDTHNUM-NEXT: [[BF_VALUE:%.*]] = and i32 [[INC]], 65535 // LEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD1]], -65536 // LEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], [[BF_VALUE]] -// LEWIDTHNUM-NEXT: store volatile i32 [[BF_SET]], ptr [[TMP1]], align 4 +// LEWIDTHNUM-NEXT: store volatile i32 [[BF_SET]], ptr [[TMP0]], align 4 // LEWIDTHNUM-NEXT: [[BF_RESULT_SHL:%.*]] = shl i32 [[BF_VALUE]], 16 // LEWIDTHNUM-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i32 [[BF_RESULT_SHL]], 16 // LEWIDTHNUM-NEXT: ret void // // BEWIDTHNUM-LABEL: @increment_v_d_st16( // BEWIDTHNUM-NEXT: entry: -// BEWIDTHNUM-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 3 -// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// BEWIDTHNUM-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[S:%.*]], i32 3 +// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[TMP0]], align 4 // BEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i32 [[BF_LOAD]], 16 // BEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_ASHR]], 1 -// BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[TMP0]], align 4 // BEWIDTHNUM-NEXT: [[BF_VALUE:%.*]] = and i32 [[INC]], 65535 // BEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i32 [[BF_VALUE]], 16 // BEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD1]], 65535 // BEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], [[BF_SHL]] -// BEWIDTHNUM-NEXT: store volatile i32 [[BF_SET]], ptr [[TMP1]], align 4 +// BEWIDTHNUM-NEXT: store volatile i32 [[BF_SET]], ptr [[TMP0]], align 4 // BEWIDTHNUM-NEXT: [[BF_RESULT_SHL:%.*]] = shl i32 [[BF_VALUE]], 16 // BEWIDTHNUM-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i32 [[BF_RESULT_SHL]], 16 // BEWIDTHNUM-NEXT: ret void @@ -4224,146 +3674,62 @@ char c : 8; // LE-LABEL: @increment_v_b_st17( // LE-NEXT: entry: -// LE-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// LE-NEXT: [[BF_SHL:%.*]] = shl i40 [[BF_LOAD]], 8 -// LE-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_SHL]], 8 -// LE-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i32 -// LE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i40 -// LE-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// LE-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 4294967295 -// LE-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], -4294967296 -// LE-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_VALUE]] -// LE-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// LE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// LE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 8 -// LE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i32 +// LE-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[S:%.*]], align 1 +// LE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LE-NEXT: store volatile i32 [[INC]], ptr [[S]], align 1 // LE-NEXT: ret void // // BE-LABEL: @increment_v_b_st17( // BE-NEXT: entry: -// BE-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// BE-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_LOAD]], 8 -// BE-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i32 -// BE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BE-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i40 -// BE-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// BE-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 4294967295 -// BE-NEXT: [[BF_SHL:%.*]] = 
shl i40 [[BF_VALUE]], 8 -// BE-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], 255 -// BE-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_SHL]] -// BE-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// BE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// BE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 8 -// BE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i32 +// BE-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[S:%.*]], align 1 +// BE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BE-NEXT: store volatile i32 [[INC]], ptr [[S]], align 1 // BE-NEXT: ret void // // LENUMLOADS-LABEL: @increment_v_b_st17( // LENUMLOADS-NEXT: entry: -// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// LENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i40 [[BF_LOAD]], 8 -// LENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_SHL]], 8 -// LENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i32 -// LENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i40 -// LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// LENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 4294967295 -// LENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], -4294967296 -// LENUMLOADS-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_VALUE]] -// LENUMLOADS-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// LENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// LENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 8 -// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i32 +// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[S:%.*]], align 1 +// LENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[S]], align 1 +// LENUMLOADS-NEXT: store volatile i32 [[INC]], ptr [[S]], align 1 // LENUMLOADS-NEXT: ret void // // BENUMLOADS-LABEL: @increment_v_b_st17( // BENUMLOADS-NEXT: entry: -// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// BENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_LOAD]], 8 -// BENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i32 -// BENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BENUMLOADS-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i40 -// BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// BENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 4294967295 -// BENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// BENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], 255 -// BENUMLOADS-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_SHL]] -// BENUMLOADS-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// BENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// BENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 8 -// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i32 +// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[S:%.*]], align 1 +// BENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[S]], align 1 +// BENUMLOADS-NEXT: store volatile i32 [[INC]], ptr [[S]], align 1 // BENUMLOADS-NEXT: ret void // // LEWIDTH-LABEL: @increment_v_b_st17( // LEWIDTH-NEXT: entry: -// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// LEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i40 
[[BF_LOAD]], 8 -// LEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_SHL]], 8 -// LEWIDTH-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i32 -// LEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LEWIDTH-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i40 -// LEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// LEWIDTH-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 4294967295 -// LEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], -4294967296 -// LEWIDTH-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_VALUE]] -// LEWIDTH-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// LEWIDTH-NEXT: [[BF_RESULT_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// LEWIDTH-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 8 -// LEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i32 +// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[S:%.*]], align 1 +// LEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LEWIDTH-NEXT: store volatile i32 [[INC]], ptr [[S]], align 1 // LEWIDTH-NEXT: ret void // // BEWIDTH-LABEL: @increment_v_b_st17( // BEWIDTH-NEXT: entry: -// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// BEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_LOAD]], 8 -// BEWIDTH-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i32 -// BEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BEWIDTH-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i40 -// BEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// BEWIDTH-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 4294967295 -// BEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// BEWIDTH-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], 255 -// BEWIDTH-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_SHL]] -// BEWIDTH-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// BEWIDTH-NEXT: [[BF_RESULT_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// BEWIDTH-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 8 -// BEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i32 +// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[S:%.*]], align 1 +// BEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BEWIDTH-NEXT: store volatile i32 [[INC]], ptr [[S]], align 1 // BEWIDTH-NEXT: ret void // // LEWIDTHNUM-LABEL: @increment_v_b_st17( // LEWIDTHNUM-NEXT: entry: -// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// LEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i40 [[BF_LOAD]], 8 -// LEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_SHL]], 8 -// LEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i32 -// LEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LEWIDTHNUM-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i40 -// LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// LEWIDTHNUM-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 4294967295 -// LEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], -4294967296 -// LEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_VALUE]] -// LEWIDTHNUM-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// LEWIDTHNUM-NEXT: [[BF_RESULT_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// LEWIDTHNUM-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 8 -// LEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i32 +// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[S:%.*]], align 1 +// LEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[S]], align 
1 +// LEWIDTHNUM-NEXT: store volatile i32 [[INC]], ptr [[S]], align 1 // LEWIDTHNUM-NEXT: ret void // // BEWIDTHNUM-LABEL: @increment_v_b_st17( // BEWIDTHNUM-NEXT: entry: -// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// BEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_LOAD]], 8 -// BEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i32 -// BEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BEWIDTHNUM-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i40 -// BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// BEWIDTHNUM-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 4294967295 -// BEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// BEWIDTHNUM-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], 255 -// BEWIDTHNUM-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_SHL]] -// BEWIDTHNUM-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// BEWIDTHNUM-NEXT: [[BF_RESULT_SHL:%.*]] = shl i40 [[BF_VALUE]], 8 -// BEWIDTHNUM-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 8 -// BEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i32 +// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i32, ptr [[S:%.*]], align 1 +// BEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_LOAD]], 1 +// BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i32, ptr [[S]], align 1 +// BEWIDTHNUM-NEXT: store volatile i32 [[INC]], ptr [[S]], align 1 // BEWIDTHNUM-NEXT: ret void // void increment_v_b_st17(volatile struct st17 *s) { @@ -4372,108 +3738,70 @@ void increment_v_b_st17(volatile struct st17 *s) { // LE-LABEL: @increment_v_c_st17( // LE-NEXT: entry: -// LE-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// LE-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_LOAD]], 32 -// LE-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i8 -// LE-NEXT: [[INC:%.*]] = add i8 [[BF_CAST]], 1 -// LE-NEXT: [[TMP1:%.*]] = zext i8 [[INC]] to i40 -// LE-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// LE-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 255 -// LE-NEXT: [[BF_SHL:%.*]] = shl i40 [[BF_VALUE]], 32 -// LE-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], 4294967295 -// LE-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_SHL]] -// LE-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// LE-NEXT: [[BF_RESULT_SHL:%.*]] = shl i40 [[BF_VALUE]], 32 -// LE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 32 -// LE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i8 +// LE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST17:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// LE-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 1 +// LE-NEXT: [[INC:%.*]] = add i8 [[BF_LOAD]], 1 +// LE-NEXT: store volatile i8 [[INC]], ptr [[C]], align 1 // LE-NEXT: ret void // // BE-LABEL: @increment_v_c_st17( // BE-NEXT: entry: -// BE-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// BE-NEXT: [[BF_SHL:%.*]] = shl i40 [[BF_LOAD]], 32 -// BE-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_SHL]], 32 -// BE-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i8 -// BE-NEXT: [[INC:%.*]] = add i8 [[BF_CAST]], 1 -// BE-NEXT: [[TMP1:%.*]] = zext i8 [[INC]] to i40 -// BE-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// BE-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 255 -// BE-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], -256 -// BE-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_VALUE]] -// BE-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// BE-NEXT: [[BF_RESULT_SHL:%.*]] = shl 
i40 [[BF_VALUE]], 32 -// BE-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 32 -// BE-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i8 +// BE-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST17:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// BE-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 1 +// BE-NEXT: [[INC:%.*]] = add i8 [[BF_LOAD]], 1 +// BE-NEXT: store volatile i8 [[INC]], ptr [[C]], align 1 // BE-NEXT: ret void // // LENUMLOADS-LABEL: @increment_v_c_st17( // LENUMLOADS-NEXT: entry: -// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// LENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_LOAD]], 32 -// LENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i8 -// LENUMLOADS-NEXT: [[INC:%.*]] = add i8 [[BF_CAST]], 1 -// LENUMLOADS-NEXT: [[TMP1:%.*]] = zext i8 [[INC]] to i40 -// LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// LENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 255 -// LENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i40 [[BF_VALUE]], 32 -// LENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], 4294967295 -// LENUMLOADS-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_SHL]] -// LENUMLOADS-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// LENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i40 [[BF_VALUE]], 32 -// LENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 32 -// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i8 +// LENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST17:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 1 +// LENUMLOADS-NEXT: [[INC:%.*]] = add i8 [[BF_LOAD]], 1 +// LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[C]], align 1 +// LENUMLOADS-NEXT: store volatile i8 [[INC]], ptr [[C]], align 1 // LENUMLOADS-NEXT: ret void // // BENUMLOADS-LABEL: @increment_v_c_st17( // BENUMLOADS-NEXT: entry: -// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i40, ptr [[S:%.*]], align 1 -// BENUMLOADS-NEXT: [[BF_SHL:%.*]] = shl i40 [[BF_LOAD]], 32 -// BENUMLOADS-NEXT: [[BF_ASHR:%.*]] = ashr i40 [[BF_SHL]], 32 -// BENUMLOADS-NEXT: [[BF_CAST:%.*]] = trunc i40 [[BF_ASHR]] to i8 -// BENUMLOADS-NEXT: [[INC:%.*]] = add i8 [[BF_CAST]], 1 -// BENUMLOADS-NEXT: [[TMP1:%.*]] = zext i8 [[INC]] to i40 -// BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i40, ptr [[S]], align 1 -// BENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i40 [[TMP1]], 255 -// BENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i40 [[BF_LOAD1]], -256 -// BENUMLOADS-NEXT: [[BF_SET:%.*]] = or i40 [[BF_CLEAR]], [[BF_VALUE]] -// BENUMLOADS-NEXT: store volatile i40 [[BF_SET]], ptr [[S]], align 1 -// BENUMLOADS-NEXT: [[BF_RESULT_SHL:%.*]] = shl i40 [[BF_VALUE]], 32 -// BENUMLOADS-NEXT: [[BF_RESULT_ASHR:%.*]] = ashr i40 [[BF_RESULT_SHL]], 32 -// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = trunc i40 [[BF_RESULT_ASHR]] to i8 +// BENUMLOADS-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST17:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 1 +// BENUMLOADS-NEXT: [[INC:%.*]] = add i8 [[BF_LOAD]], 1 +// BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[C]], align 1 +// BENUMLOADS-NEXT: store volatile i8 [[INC]], ptr [[C]], align 1 // BENUMLOADS-NEXT: ret void // // LEWIDTH-LABEL: @increment_v_c_st17( // LEWIDTH-NEXT: entry: -// LEWIDTH-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i32 4 -// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load 
volatile i8, ptr [[TMP1]], align 1 +// LEWIDTH-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST17:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 1 // LEWIDTH-NEXT: [[INC:%.*]] = add i8 [[BF_LOAD]], 1 -// LEWIDTH-NEXT: store volatile i8 [[INC]], ptr [[TMP1]], align 1 +// LEWIDTH-NEXT: store volatile i8 [[INC]], ptr [[C]], align 1 // LEWIDTH-NEXT: ret void // // BEWIDTH-LABEL: @increment_v_c_st17( // BEWIDTH-NEXT: entry: -// BEWIDTH-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i32 4 -// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP1]], align 1 +// BEWIDTH-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST17:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 1 // BEWIDTH-NEXT: [[INC:%.*]] = add i8 [[BF_LOAD]], 1 -// BEWIDTH-NEXT: store volatile i8 [[INC]], ptr [[TMP1]], align 1 +// BEWIDTH-NEXT: store volatile i8 [[INC]], ptr [[C]], align 1 // BEWIDTH-NEXT: ret void // // LEWIDTHNUM-LABEL: @increment_v_c_st17( // LEWIDTHNUM-NEXT: entry: -// LEWIDTHNUM-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i32 4 -// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP1]], align 1 +// LEWIDTHNUM-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST17:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 1 // LEWIDTHNUM-NEXT: [[INC:%.*]] = add i8 [[BF_LOAD]], 1 -// LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[TMP1]], align 1 -// LEWIDTHNUM-NEXT: store volatile i8 [[INC]], ptr [[TMP1]], align 1 +// LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[C]], align 1 +// LEWIDTHNUM-NEXT: store volatile i8 [[INC]], ptr [[C]], align 1 // LEWIDTHNUM-NEXT: ret void // // BEWIDTHNUM-LABEL: @increment_v_c_st17( // BEWIDTHNUM-NEXT: entry: -// BEWIDTHNUM-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i32 4 -// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[TMP1]], align 1 +// BEWIDTHNUM-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT_ST17:%.*]], ptr [[S:%.*]], i32 0, i32 1 +// BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[C]], align 1 // BEWIDTHNUM-NEXT: [[INC:%.*]] = add i8 [[BF_LOAD]], 1 -// BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[TMP1]], align 1 -// BEWIDTHNUM-NEXT: store volatile i8 [[INC]], ptr [[TMP1]], align 1 +// BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[C]], align 1 +// BEWIDTHNUM-NEXT: store volatile i8 [[INC]], ptr [[C]], align 1 // BEWIDTHNUM-NEXT: ret void // void increment_v_c_st17(volatile struct st17 *s) { @@ -4493,9 +3821,9 @@ struct zero_bitfield { // LE-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[S:%.*]], align 4 // LE-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i32 // LE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LE-NEXT: [[TMP1:%.*]] = trunc i32 [[INC]] to i8 -// LE-NEXT: store volatile i8 [[TMP1]], ptr [[S]], align 4 -// LE-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i32 +// LE-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i8 +// LE-NEXT: store volatile i8 [[TMP0]], ptr [[S]], align 4 +// LE-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i32 // LE-NEXT: ret void // // BE-LABEL: @increment_a_zero_bitfield( @@ -4503,9 +3831,9 @@ struct zero_bitfield { // BE-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[S:%.*]], align 4 // BE-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i32 // BE-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BE-NEXT: 
[[TMP1:%.*]] = trunc i32 [[INC]] to i8 -// BE-NEXT: store volatile i8 [[TMP1]], ptr [[S]], align 4 -// BE-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i32 +// BE-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i8 +// BE-NEXT: store volatile i8 [[TMP0]], ptr [[S]], align 4 +// BE-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i32 // BE-NEXT: ret void // // LENUMLOADS-LABEL: @increment_a_zero_bitfield( @@ -4513,10 +3841,10 @@ struct zero_bitfield { // LENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[S:%.*]], align 4 // LENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i32 // LENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LENUMLOADS-NEXT: [[TMP1:%.*]] = trunc i32 [[INC]] to i8 +// LENUMLOADS-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i8 // LENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[S]], align 4 -// LENUMLOADS-NEXT: store volatile i8 [[TMP1]], ptr [[S]], align 4 -// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i32 +// LENUMLOADS-NEXT: store volatile i8 [[TMP0]], ptr [[S]], align 4 +// LENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i32 // LENUMLOADS-NEXT: ret void // // BENUMLOADS-LABEL: @increment_a_zero_bitfield( @@ -4524,10 +3852,10 @@ struct zero_bitfield { // BENUMLOADS-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[S:%.*]], align 4 // BENUMLOADS-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i32 // BENUMLOADS-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BENUMLOADS-NEXT: [[TMP1:%.*]] = trunc i32 [[INC]] to i8 +// BENUMLOADS-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i8 // BENUMLOADS-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[S]], align 4 -// BENUMLOADS-NEXT: store volatile i8 [[TMP1]], ptr [[S]], align 4 -// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i32 +// BENUMLOADS-NEXT: store volatile i8 [[TMP0]], ptr [[S]], align 4 +// BENUMLOADS-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i32 // BENUMLOADS-NEXT: ret void // // LEWIDTH-LABEL: @increment_a_zero_bitfield( @@ -4535,9 +3863,9 @@ struct zero_bitfield { // LEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[S:%.*]], align 4 // LEWIDTH-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i32 // LEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LEWIDTH-NEXT: [[TMP1:%.*]] = trunc i32 [[INC]] to i8 -// LEWIDTH-NEXT: store volatile i8 [[TMP1]], ptr [[S]], align 4 -// LEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i32 +// LEWIDTH-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i8 +// LEWIDTH-NEXT: store volatile i8 [[TMP0]], ptr [[S]], align 4 +// LEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i32 // LEWIDTH-NEXT: ret void // // BEWIDTH-LABEL: @increment_a_zero_bitfield( @@ -4545,9 +3873,9 @@ struct zero_bitfield { // BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[S:%.*]], align 4 // BEWIDTH-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i32 // BEWIDTH-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BEWIDTH-NEXT: [[TMP1:%.*]] = trunc i32 [[INC]] to i8 -// BEWIDTH-NEXT: store volatile i8 [[TMP1]], ptr [[S]], align 4 -// BEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i32 +// BEWIDTH-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i8 +// BEWIDTH-NEXT: store volatile i8 [[TMP0]], ptr [[S]], align 4 +// BEWIDTH-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i32 // BEWIDTH-NEXT: ret void // // LEWIDTHNUM-LABEL: @increment_a_zero_bitfield( @@ -4555,10 +3883,10 @@ struct zero_bitfield { // LEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[S:%.*]], align 4 // LEWIDTHNUM-NEXT: 
[[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i32 // LEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// LEWIDTHNUM-NEXT: [[TMP1:%.*]] = trunc i32 [[INC]] to i8 +// LEWIDTHNUM-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i8 // LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[S]], align 4 -// LEWIDTHNUM-NEXT: store volatile i8 [[TMP1]], ptr [[S]], align 4 -// LEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i32 +// LEWIDTHNUM-NEXT: store volatile i8 [[TMP0]], ptr [[S]], align 4 +// LEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i32 // LEWIDTHNUM-NEXT: ret void // // BEWIDTHNUM-LABEL: @increment_a_zero_bitfield( @@ -4566,10 +3894,10 @@ struct zero_bitfield { // BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i8, ptr [[S:%.*]], align 4 // BEWIDTHNUM-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_LOAD]] to i32 // BEWIDTHNUM-NEXT: [[INC:%.*]] = add nsw i32 [[BF_CAST]], 1 -// BEWIDTHNUM-NEXT: [[TMP1:%.*]] = trunc i32 [[INC]] to i8 +// BEWIDTHNUM-NEXT: [[TMP0:%.*]] = trunc i32 [[INC]] to i8 // BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[S]], align 4 -// BEWIDTHNUM-NEXT: store volatile i8 [[TMP1]], ptr [[S]], align 4 -// BEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP1]] to i32 +// BEWIDTHNUM-NEXT: store volatile i8 [[TMP0]], ptr [[S]], align 4 +// BEWIDTHNUM-NEXT: [[BF_RESULT_CAST:%.*]] = sext i8 [[TMP0]] to i32 // BEWIDTHNUM-NEXT: ret void // void increment_a_zero_bitfield(volatile struct zero_bitfield *s) { @@ -4692,9 +4020,9 @@ struct zero_bitfield_ok { // LE-NEXT: [[CONV3:%.*]] = sext i8 [[BF_CAST]] to i32 // LE-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV3]], [[CONV]] // LE-NEXT: [[CONV4:%.*]] = trunc i32 [[ADD]] to i8 -// LE-NEXT: [[TMP2:%.*]] = zext i8 [[CONV4]] to i16 +// LE-NEXT: [[TMP0:%.*]] = zext i8 [[CONV4]] to i16 // LE-NEXT: [[BF_LOAD5:%.*]] = load volatile i16, ptr [[S]], align 4 -// LE-NEXT: [[BF_VALUE:%.*]] = and i16 [[TMP2]], 255 +// LE-NEXT: [[BF_VALUE:%.*]] = and i16 [[TMP0]], 255 // LE-NEXT: [[BF_SHL6:%.*]] = shl i16 [[BF_VALUE]], 8 // LE-NEXT: [[BF_CLEAR:%.*]] = and i16 [[BF_LOAD5]], 255 // LE-NEXT: [[BF_SET:%.*]] = or i16 [[BF_CLEAR]], [[BF_SHL6]] @@ -4716,9 +4044,9 @@ struct zero_bitfield_ok { // BE-NEXT: [[CONV3:%.*]] = sext i8 [[BF_CAST]] to i32 // BE-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV3]], [[CONV]] // BE-NEXT: [[CONV4:%.*]] = trunc i32 [[ADD]] to i8 -// BE-NEXT: [[TMP2:%.*]] = zext i8 [[CONV4]] to i16 +// BE-NEXT: [[TMP0:%.*]] = zext i8 [[CONV4]] to i16 // BE-NEXT: [[BF_LOAD5:%.*]] = load volatile i16, ptr [[S]], align 4 -// BE-NEXT: [[BF_VALUE:%.*]] = and i16 [[TMP2]], 255 +// BE-NEXT: [[BF_VALUE:%.*]] = and i16 [[TMP0]], 255 // BE-NEXT: [[BF_CLEAR:%.*]] = and i16 [[BF_LOAD5]], -256 // BE-NEXT: [[BF_SET:%.*]] = or i16 [[BF_CLEAR]], [[BF_VALUE]] // BE-NEXT: store volatile i16 [[BF_SET]], ptr [[S]], align 4 @@ -4739,9 +4067,9 @@ struct zero_bitfield_ok { // LENUMLOADS-NEXT: [[CONV3:%.*]] = sext i8 [[BF_CAST]] to i32 // LENUMLOADS-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV3]], [[CONV]] // LENUMLOADS-NEXT: [[CONV4:%.*]] = trunc i32 [[ADD]] to i8 -// LENUMLOADS-NEXT: [[TMP2:%.*]] = zext i8 [[CONV4]] to i16 +// LENUMLOADS-NEXT: [[TMP0:%.*]] = zext i8 [[CONV4]] to i16 // LENUMLOADS-NEXT: [[BF_LOAD5:%.*]] = load volatile i16, ptr [[S]], align 4 -// LENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i16 [[TMP2]], 255 +// LENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i16 [[TMP0]], 255 // LENUMLOADS-NEXT: [[BF_SHL6:%.*]] = shl i16 [[BF_VALUE]], 8 // LENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i16 [[BF_LOAD5]], 255 // LENUMLOADS-NEXT: [[BF_SET:%.*]] = 
or i16 [[BF_CLEAR]], [[BF_SHL6]] @@ -4763,9 +4091,9 @@ struct zero_bitfield_ok { // BENUMLOADS-NEXT: [[CONV3:%.*]] = sext i8 [[BF_CAST]] to i32 // BENUMLOADS-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV3]], [[CONV]] // BENUMLOADS-NEXT: [[CONV4:%.*]] = trunc i32 [[ADD]] to i8 -// BENUMLOADS-NEXT: [[TMP2:%.*]] = zext i8 [[CONV4]] to i16 +// BENUMLOADS-NEXT: [[TMP0:%.*]] = zext i8 [[CONV4]] to i16 // BENUMLOADS-NEXT: [[BF_LOAD5:%.*]] = load volatile i16, ptr [[S]], align 4 -// BENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i16 [[TMP2]], 255 +// BENUMLOADS-NEXT: [[BF_VALUE:%.*]] = and i16 [[TMP0]], 255 // BENUMLOADS-NEXT: [[BF_CLEAR:%.*]] = and i16 [[BF_LOAD5]], -256 // BENUMLOADS-NEXT: [[BF_SET:%.*]] = or i16 [[BF_CLEAR]], [[BF_VALUE]] // BENUMLOADS-NEXT: store volatile i16 [[BF_SET]], ptr [[S]], align 4 @@ -4780,12 +4108,12 @@ struct zero_bitfield_ok { // LEWIDTH-NEXT: [[BF_SHL:%.*]] = shl i16 [[BF_LOAD]], 8 // LEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i16 [[BF_SHL]], 8 // LEWIDTH-NEXT: [[CONV:%.*]] = sext i16 [[BF_ASHR]] to i32 -// LEWIDTH-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[S]], i32 1 -// LEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[TMP2]], align 1 +// LEWIDTH-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[S]], i32 1 +// LEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[TMP0]], align 1 // LEWIDTH-NEXT: [[CONV2:%.*]] = sext i8 [[BF_LOAD1]] to i32 // LEWIDTH-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV2]], [[CONV]] // LEWIDTH-NEXT: [[CONV3:%.*]] = trunc i32 [[ADD]] to i8 -// LEWIDTH-NEXT: store volatile i8 [[CONV3]], ptr [[TMP2]], align 1 +// LEWIDTH-NEXT: store volatile i8 [[CONV3]], ptr [[TMP0]], align 1 // LEWIDTH-NEXT: ret void // // BEWIDTH-LABEL: @increment_a_zero_bitfield_ok( @@ -4793,12 +4121,12 @@ struct zero_bitfield_ok { // BEWIDTH-NEXT: [[BF_LOAD:%.*]] = load volatile i16, ptr [[S:%.*]], align 4 // BEWIDTH-NEXT: [[BF_ASHR:%.*]] = ashr i16 [[BF_LOAD]], 8 // BEWIDTH-NEXT: [[CONV:%.*]] = sext i16 [[BF_ASHR]] to i32 -// BEWIDTH-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[S]], i32 1 -// BEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[TMP2]], align 1 +// BEWIDTH-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[S]], i32 1 +// BEWIDTH-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[TMP0]], align 1 // BEWIDTH-NEXT: [[CONV2:%.*]] = sext i8 [[BF_LOAD1]] to i32 // BEWIDTH-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV2]], [[CONV]] // BEWIDTH-NEXT: [[CONV3:%.*]] = trunc i32 [[ADD]] to i8 -// BEWIDTH-NEXT: store volatile i8 [[CONV3]], ptr [[TMP2]], align 1 +// BEWIDTH-NEXT: store volatile i8 [[CONV3]], ptr [[TMP0]], align 1 // BEWIDTH-NEXT: ret void // // LEWIDTHNUM-LABEL: @increment_a_zero_bitfield_ok( @@ -4807,13 +4135,13 @@ struct zero_bitfield_ok { // LEWIDTHNUM-NEXT: [[BF_SHL:%.*]] = shl i16 [[BF_LOAD]], 8 // LEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i16 [[BF_SHL]], 8 // LEWIDTHNUM-NEXT: [[CONV:%.*]] = sext i16 [[BF_ASHR]] to i32 -// LEWIDTHNUM-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[S]], i32 1 -// LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[TMP2]], align 1 +// LEWIDTHNUM-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[S]], i32 1 +// LEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[TMP0]], align 1 // LEWIDTHNUM-NEXT: [[CONV2:%.*]] = sext i8 [[BF_LOAD1]] to i32 // LEWIDTHNUM-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV2]], [[CONV]] // LEWIDTHNUM-NEXT: [[CONV3:%.*]] = trunc i32 [[ADD]] to i8 -// LEWIDTHNUM-NEXT: [[BF_LOAD4:%.*]] = load volatile i8, ptr [[TMP2]], align 1 -// LEWIDTHNUM-NEXT: store volatile i8 
[[CONV3]], ptr [[TMP2]], align 1 +// LEWIDTHNUM-NEXT: [[BF_LOAD4:%.*]] = load volatile i8, ptr [[TMP0]], align 1 +// LEWIDTHNUM-NEXT: store volatile i8 [[CONV3]], ptr [[TMP0]], align 1 // LEWIDTHNUM-NEXT: ret void // // BEWIDTHNUM-LABEL: @increment_a_zero_bitfield_ok( @@ -4821,13 +4149,13 @@ struct zero_bitfield_ok { // BEWIDTHNUM-NEXT: [[BF_LOAD:%.*]] = load volatile i16, ptr [[S:%.*]], align 4 // BEWIDTHNUM-NEXT: [[BF_ASHR:%.*]] = ashr i16 [[BF_LOAD]], 8 // BEWIDTHNUM-NEXT: [[CONV:%.*]] = sext i16 [[BF_ASHR]] to i32 -// BEWIDTHNUM-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[S]], i32 1 -// BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[TMP2]], align 1 +// BEWIDTHNUM-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[S]], i32 1 +// BEWIDTHNUM-NEXT: [[BF_LOAD1:%.*]] = load volatile i8, ptr [[TMP0]], align 1 // BEWIDTHNUM-NEXT: [[CONV2:%.*]] = sext i8 [[BF_LOAD1]] to i32 // BEWIDTHNUM-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV2]], [[CONV]] // BEWIDTHNUM-NEXT: [[CONV3:%.*]] = trunc i32 [[ADD]] to i8 -// BEWIDTHNUM-NEXT: [[BF_LOAD4:%.*]] = load volatile i8, ptr [[TMP2]], align 1 -// BEWIDTHNUM-NEXT: store volatile i8 [[CONV3]], ptr [[TMP2]], align 1 +// BEWIDTHNUM-NEXT: [[BF_LOAD4:%.*]] = load volatile i8, ptr [[TMP0]], align 1 +// BEWIDTHNUM-NEXT: store volatile i8 [[CONV3]], ptr [[TMP0]], align 1 // BEWIDTHNUM-NEXT: ret void // void increment_a_zero_bitfield_ok(volatile struct zero_bitfield_ok *s) { diff --git a/clang/test/CodeGen/arm-bitfield-alignment.c b/clang/test/CodeGen/arm-bitfield-alignment.c index e34789face558..5d0967ec70346 100644 --- a/clang/test/CodeGen/arm-bitfield-alignment.c +++ b/clang/test/CodeGen/arm-bitfield-alignment.c @@ -1,5 +1,7 @@ -// RUN: %clang_cc1 -triple arm-none-eabi -ffreestanding -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -ffreestanding -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple arm-none-eabi -fdump-record-layouts-simple -ffreestanding -emit-llvm -o %t %s | FileCheck %s -check-prefixes=LAYOUT,LAYOUT-32 +// RUN: FileCheck %s -check-prefixes=IR,IR-32 <%t +// RUN: %clang_cc1 -triple aarch64 -fdump-record-layouts-simple -ffreestanding -emit-llvm -o %t %s | FileCheck %s -check-prefixes=LAYOUT,LAYOUT-64 +// RUN: FileCheck %s -check-prefixes=IR,IR-64 <%t extern struct T { int b0 : 8; @@ -11,5 +13,18 @@ int func(void) { return g.b1; } -// CHECK: @g = external global %struct.T, align 4 -// CHECK: %{{.*}} = load i64, ptr @g, align 4 +// IR: @g = external global %struct.T, align 4 +// IR-32: %{{.*}} = load i32, ptr @g, align 4 +// IR-64: %{{.*}} = load i64, ptr @g, align 4 + +// LAYOUT-LABEL: LLVMType:%struct.T = +// LAYOUT-32-SAME: type { i32, i8 } +// LAYOUT-64-SAME: type { i64 } +// LAYOUT: BitFields:[ +// LAYOUT-32-NEXT: diff --git a/clang/test/CodeGen/arm64-be-bitfield.c b/clang/test/CodeGen/arm64-be-bitfield.c index 58c3185392984..57e20b5b62b9c 100644 --- a/clang/test/CodeGen/arm64-be-bitfield.c +++ b/clang/test/CodeGen/arm64-be-bitfield.c @@ -1,11 +1,25 @@ -// RUN: %clang_cc1 -triple aarch64_be-linux-gnu -ffreestanding -emit-llvm -O0 -o - %s | FileCheck --check-prefix IR %s +// RUN: %clang_cc1 -triple aarch64_be-linux-gnu -ffreestanding -emit-llvm -O0 -o %t -fdump-record-layouts-simple %s | FileCheck %s --check-prefix=LAYOUT +// RUN: FileCheck %s --check-prefix=IR <%t struct bt3 { signed b2:10; signed b3:10; } b16; // Get the high 32-bits and then shift appropriately for big-endian. 
signed callee_b0f(struct bt3 bp11) { // IR: callee_b0f(i64 [[ARG:%.*]]) -// IR: store i64 [[ARG]], ptr [[PTR:%.*]], align 8 -// IR: call void @llvm.memcpy.p0.p0.i64(ptr {{.*}}, ptr align 8 [[PTR]], i64 4 +// IR: [[BP11:%.*]] = alloca %struct.bt3, align 4 +// IR: [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.bt3, ptr [[BP11]], i32 0, i32 0 +// IR: [[COERCE_HIGHBITS:%.*]] = lshr i64 [[ARG]], 32 +// IR: [[COERCE_VAL_II:%.*]] = trunc i64 [[COERCE_HIGHBITS]] to i32 +// IR: store i32 [[COERCE_VAL_II]], ptr [[COERCE_DIVE]], align 4 +// IR: [[BF_LOAD:%.*]] = load i32, ptr [[BP11]], align 4 +// IR: [[BF_ASHR:%.*]] = ashr i32 [[BF_LOAD]], 22 +// IR: ret i32 [[BF_ASHR]] return bp11.b2; } + +// LAYOUT-LABEL: LLVMType:%struct.bt3 = +// LAYOUT-SAME: type { i32 } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: diff --git a/clang/test/CodeGen/bitfield-2.c b/clang/test/CodeGen/bitfield-2.c index 3e0b30c7a17d8..8688ba6390ddb 100644 --- a/clang/test/CodeGen/bitfield-2.c +++ b/clang/test/CodeGen/bitfield-2.c @@ -271,11 +271,11 @@ _Bool test_6(void) { // CHECK-RECORD: *** Dumping IRgen Record Layout // CHECK-RECORD: Record: RecordDecl{{.*}}s7 // CHECK-RECORD: Layout: // CHECK-RECORD: IsZeroInitializable:1 // CHECK-RECORD: BitFields:[ -// CHECK-RECORD: + +// This will often be align(1) with -fno-bitfield-type-align +struct P2 { + unsigned a :8; + char :0; + short :0; + unsigned b :8; +} p2; +// CHECK-LABEL: LLVMType:%struct.P2 = +// LAYOUT-T-SAME: type { i8, i8, i8, i8 } +// LAYOUT-ARM64-T-SAME: type { i8, i8, i8, i8 } +// LAYOUT-NT-SAME: type { i8, i8 } +// LAYOUT-STRICT-NT-SAME: type { i8, i8 } +// LAYOUT-DWN32-SAME: type { i8, [3 x i8], i8, [3 x i8] } +// CHECK: BitFields:[ +// LAYOUT-T-NEXT: + +struct P3 { + unsigned a :8; + char :0; + short :0; + unsigned :0; + unsigned b :8; +} p3; +// CHECK-LABEL: LLVMType:%struct.P3 = +// LAYOUT-T-SAME: type { i8, [3 x i8], i8, [3 x i8] } +// LAYOUT-ARM64-T-SAME: type { i8, [3 x i8], i8, [3 x i8] } +// LAYOUT-NT-SAME: type { i8, i8 } +// LAYOUT-STRICT-NT-SAME: type { i8, i8 } +// LAYOUT-DWN32-SAME: type { i8, [3 x i8], i8, [3 x i8] } +// CHECK: BitFields:[ +// LAYOUT-T-NEXT: + +struct P4 { + unsigned a :8; + short :0; + unsigned :0; + unsigned b :8; +} p4; +// CHECK-LABEL: LLVMType:%struct.P4 = +// LAYOUT-T-SAME: type { i8, [3 x i8], i8, [3 x i8] } +// LAYOUT-ARM64-T-SAME: type { i8, [3 x i8], i8, [3 x i8] } +// LAYOUT-NT-SAME: type { i8, i8 } +// LAYOUT-STRICT-NT-SAME: type { i8, i8 } +// LAYOUT-DWN32-SAME: type { i8, [3 x i8], i8, [3 x i8] } +// CHECK: BitFields:[ +// LAYOUT-T-NEXT: + +struct P5 { + unsigned a :8; + unsigned :0; + unsigned b :8; +} p5; +// CHECK-LABEL: LLVMType:%struct.P5 = +// LAYOUT-T-SAME: type { i8, [3 x i8], i8, [3 x i8] } +// LAYOUT-ARM64-T-SAME: type { i8, [3 x i8], i8, [3 x i8] } +// LAYOUT-NT-SAME: type { i8, i8 } +// LAYOUT-STRICT-NT-SAME: type { i8, i8 } +// LAYOUT-DWN32-SAME: type { i8, [3 x i8], i8, [3 x i8] } +// CHECK: BitFields:[ +// LAYOUT-T-NEXT: + +struct P6 { + unsigned a :8; + unsigned :0; + short :0; + char :0; + unsigned b :8; +} p6; +// CHECK-LABEL: LLVMType:%struct.P6 = +// LAYOUT-T-SAME: type { i8, [3 x i8], i8, [3 x i8] } +// LAYOUT-ARM64-T-SAME: type { i8, [3 x i8], i8, [3 x i8] } +// LAYOUT-NT-SAME: type { i8, i8 } +// LAYOUT-STRICT-NT-SAME: type { i8, i8 } +// LAYOUT-DWN32-SAME: type { i8, [3 x i8], i8, [3 x i8] } +// CHECK: BitFields:[ +// LAYOUT-T-NEXT: + +struct P7 { + unsigned a : 8; + short : 0; + unsigned char b : 8; +} p7; +// CHECK-LABEL: LLVMType:%struct.P7 = +// LAYOUT-T-SAME: type { i8, i8, i8, i8 } +// 
LAYOUT-ARM64-T-SAME: type { i8, i8, i8, i8 } +// LAYOUT-NT-SAME: type { i8, i8 } +// LAYOUT-STRICT-NT-SAME: type { i8, i8 } +// LAYOUT-DWN32-SAME: type { i8, [3 x i8], i8, [3 x i8] } +// CHECK: BitFields:[ +// LAYOUT-T-NEXT: + +// And with forced alignment for !useZeroLengthBitfieldAlignment machines (eg +// hexagon) +struct __attribute__ ((aligned (2))) P7_align { + unsigned a : 8; + short : 0; + unsigned char b : 8; +} p7_align; +// CHECK-LABEL: LLVMType:%struct.P7_align = +// LAYOUT-T-SAME: type { i8, i8, i8, i8 } +// LAYOUT-ARM64-T-SAME: type { i8, i8, i8, i8 } +// LAYOUT-NT-SAME: type { i8, i8 } +// LAYOUT-STRICT-NT-SAME: type { i8, i8 } +// LAYOUT-DWN32-SAME: type { i8, [3 x i8], i8, [3 x i8] } +// CHECK: BitFields:[ +// LAYOUT-T-NEXT: + +struct P8 { + unsigned a : 7; + short : 0; + unsigned char b : 7; +} p8; +// CHECK-LABEL: LLVMType:%struct.P8 = +// LAYOUT-T-SAME: type { i8, i8, i8, i8 } +// LAYOUT-ARM64-T-SAME: type { i8, i8, i8, i8 } +// LAYOUT-NT-SAME: type { i16 } +// LAYOUT-STRICT-NT-SAME: type { i16 } +// LAYOUT-DWN32-SAME: type { i8, [3 x i8], i8, [3 x i8] } +// CHECK: BitFields:[ +// LAYOUT-T-NEXT: + +struct P9 { + unsigned a : 7; + char : 0; + unsigned short b : 7; +} p9; +// CHECK-LABEL: LLVMType:%struct.P9 = +// LAYOUT-T-SAME: type { i8, i8, [2 x i8] } +// LAYOUT-ARM64-T-SAME: type { i8, i8 } +// LAYOUT-NT-SAME: type { i16 } +// LAYOUT-STRICT-NT-SAME: type { i16 } +// LAYOUT-DWN32-SAME: type { i8, [3 x i8], i8, [3 x i8] } +// CHECK: BitFields:[ +// LAYOUT-T-NEXT: + +struct __attribute__((aligned(4))) P10 { + unsigned a : 7; + unsigned short b : 7; + unsigned c : 7; + char : 0; +} p10; +// CHECK-LABEL: LLVMType:%struct.P10 = +// LAYOUT-T-SAME: type { i32 } +// LAYOUT-ARM64-T-SAME: type { i32 } +// LAYOUT-NT-SAME: type { i32 } +// LAYOUT-STRICT-NT-SAME: type { i32 } +// LAYOUT-DWN32-SAME: type { i32 } +// CHECK: BitFields:[ +// LAYOUT-T-NEXT: + +struct __attribute__((aligned(4))) P11 { + unsigned a : 7; + unsigned short b : 7; + unsigned c : 10; + char : 0; // at a char boundary +} p11; +// CHECK-LABEL: LLVMType:%struct.P11 = +// LAYOUT-T-SAME: type { i32 } +// LAYOUT-ARM64-T-SAME: type { i32 } +// LAYOUT-NT-SAME: type { i32 } +// LAYOUT-STRICT-NT-SAME: type { i32 } +// LAYOUT-DWN32-SAME: type { i32 } +// CHECK: BitFields:[ +// LAYOUT-T-NEXT: diff --git a/clang/test/CodeGen/bitfield-access-unit.c b/clang/test/CodeGen/bitfield-access-unit.c new file mode 100644 index 0000000000000..1aed2e7202fc6 --- /dev/null +++ b/clang/test/CodeGen/bitfield-access-unit.c @@ -0,0 +1,302 @@ +// Check arches with 32bit ints. 
(Not you, AVR & MSP430) + +// Configs that have cheap unaligned access + +// 64-bit Little Endian +// RUN: %clang_cc1 -triple=aarch64-apple-darwin %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-FLEX,LAYOUT-FLEX64,CHECK-64,LAYOUT-64-DWN %s +// RUN: %clang_cc1 -triple=aarch64-linux-gnu %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-FLEX,LAYOUT-FLEX64,CHECK-64,LAYOUT-64,LAYOUT-64-FLEX %s +// RUN: %clang_cc1 -triple=loongarch64-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-FLEX,LAYOUT-FLEX64,CHECK-64,LAYOUT-64,LAYOUT-64-FLEX %s +// RUN: %clang_cc1 -triple=ve-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-FLEX,LAYOUT-FLEX64 %s +// RUN: %clang_cc1 -triple=wasm64 %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-FLEX,LAYOUT-FLEX64 %s +// RUN: %clang_cc1 -triple=x86_64-linux-gnu %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-FLEX,LAYOUT-FLEX64,CHECK-64,LAYOUT-64,LAYOUT-64-FLEX %s + +// 64-bit Big Endian +// RUN: %clang_cc1 -triple=powerpc64-linux-gnu %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-FLEX,LAYOUT-FLEX64,CHECK-64,LAYOUT-64,LAYOUT-64-FLEX %s +// RUN: %clang_cc1 -triple=systemz %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-FLEX,LAYOUT-FLEX64,CHECK-64,LAYOUT-64,LAYOUT-64-FLEX %s + +// 32-bit Little Endian +// RUN: %clang_cc1 -triple=arm-apple-darwin %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT-DWN32,LAYOUT-DWN32-FLEX %s +// RUN: %clang_cc1 -triple=arm-none-eabi %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-FLEX,LAYOUT-FLEX32 %s +// RUN: %clang_cc1 -triple=i686-linux-gnu %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-FLEX,LAYOUT-FLEX32 %s +// RUN: %clang_cc1 -triple=powerpcle-linux-gnu %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-FLEX,LAYOUT-FLEX32 %s +// RUN: %clang_cc1 -triple=wasm32 %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-FLEX,LAYOUT-FLEX32 %s + +// 32-bit Big Endian +// RUN: %clang_cc1 -triple=powerpc-linux-gnu %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-FLEX,LAYOUT-FLEX32 %s + +// Configs that have expensive unaligned access +// 64-bit Little Endian +// RUN: %clang_cc1 -triple=aarch64-linux-gnu %s -target-feature +strict-align -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT,CHECK-64,LAYOUT-64,LAYOUT-64-STRICT %s +// RUN: %clang_cc1 -triple=amdgcn-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT,CHECK-64,LAYOUT-64,LAYOUT-64-STRICT %s +// RUN: %clang_cc1 -triple=loongarch64-elf -target-feature -ual %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT,CHECK-64,LAYOUT-64,LAYOUT-64-STRICT %s +// RUN: %clang_cc1 -triple=riscv64 %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck 
--check-prefixes CHECK,LAYOUT,LAYOUT-STRICT,CHECK-64,LAYOUT-64,LAYOUT-64-STRICT %s + +// 64-bit Big Endian +// RUN: %clang_cc1 -triple=mips64-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT,CHECK-64,LAYOUT-64,LAYOUT-64-STRICT %s + +// 32-bit Little Endian +// RUN: %clang_cc1 -triple=arc-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT %s +// RUN: %clang_cc1 -triple=arm-apple-darwin %s -target-feature +strict-align -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT-DWN32,LAYOUT-DWN32-STRICT %s +// RUN: %clang_cc1 -triple=arm-none-eabi %s -target-feature +strict-align -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT %s +// RUN: %clang_cc1 -triple=bpf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT %s +// RUN: %clang_cc1 -triple=csky %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT %s +// RUN: %clang_cc1 -triple=hexagon-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT %s +// RUN: %clang_cc1 -triple=loongarch32-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT %s +// RUN: %clang_cc1 -triple=nvptx-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT %s +// RUN: %clang_cc1 -triple=riscv32 %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT %s +// RUN: %clang_cc1 -triple=spir-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT %s +// RUN: %clang_cc1 -triple=xcore-none-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT %s + +// 32-bit Big Endian +// RUN: %clang_cc1 -triple=lanai-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT %s +// RUN: %clang_cc1 -triple=mips-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT %s +// RUN: %clang_cc1 -triple=sparc-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT %s +// RUN: %clang_cc1 -triple=tce-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT,LAYOUT-STRICT %s + +// Both le64-elf and m68k-elf are strict alignment ISAs with 4-byte aligned +// 64-bit or 2-byte aligned 32-bit integer types. This is more complex to describe here. + +// If unaligned access is expensive, don't stick these together. +struct A { + char a : 7; + char b : 7; +} a; +// CHECK-LABEL: LLVMType:%struct.A = +// LAYOUT-FLEX-SAME: type { i16 } +// LAYOUT-STRICT-SAME: type { i8, i8 } +// LAYOUT-DWN32-SAME: type { i16 } +// CHECK: BitFields:[ +// LAYOUT-FLEX-NEXT: + +// But do here. 
+struct __attribute__((aligned(2))) B { + char a : 7; + char b : 7; +} b; +// CHECK-LABEL: LLVMType:%struct.B = +// LAYOUT-SAME: type { i16 } +// LAYOUT-DWN32-SAME: type { i16 } +// CHECK: BitFields:[ +// LAYOUT-NEXT: + +// Not here -- poor alignment within struct +struct C { + int f1; + char f2; + char a : 7; + char b : 7; +} c; +// CHECK-LABEL: LLVMType:%struct.C = +// LAYOUT-FLEX-SAME: type <{ i32, i8, i16, i8 }> +// LAYOUT-STRICT-SAME: type { i32, i8, i8, i8 } +// LAYOUT-DWN32-SAME: type <{ i32, i8, i16, i8 }> +// CHECK: BitFields:[ +// LAYOUT-FLEX-NEXT: + +// Not here, we're packed +struct __attribute__((packed)) D { + int f1; + int a : 8; + int b : 8; + char _; +} d; +// CHECK-LABEL: LLVMType:%struct.D = +// LAYOUT-FLEX-SAME: type <{ i32, i16, i8 }> +// LAYOUT-STRICT-SAME: type <{ i32, i8, i8, i8 }> +// LAYOUT-DWN32-FLEX-SAME: type <{ i32, i16, i8 }> +// LAYOUT-DWN32-STRICT-SAME: type <{ i32, i8, i8, i8 }> +// CHECK: BitFields:[ +// LAYOUT-FLEX-NEXT: + +struct E { + char a : 7; + short b : 13; + unsigned c : 12; +} e; +// CHECK-LABEL: LLVMType:%struct.E = +// LAYOUT-FLEX64-SAME: type { i64 } +// LAYOUT-FLEX32-SAME: type { i32, i16 } +// LAYOUT-STRICT-SAME: type { i32, i16 } +// LAYOUT-DWN32-SAME: type { i32 } +// CHECK: BitFields:[ +// LAYOUT-FLEX64-NEXT: + +struct F { + char a : 7; + short b : 13; + unsigned c : 12; + signed char d : 7; +} f; +// CHECK-LABEL: LLVMType:%struct.F = +// LAYOUT-FLEX64-SAME: type { i64 } +// LAYOUT-FLEX32-SAME: type { i32, i32 } +// LAYOUT-STRICT-SAME: type { i32, i32 } +// LAYOUT-DWN32-SAME: type <{ i32, i8 }> +// CHECK: BitFields:[ +// LAYOUT-FLEX64-NEXT: + +struct G { + char a : 7; + short b : 13; + unsigned c : 12; + signed char d : 7; + signed char e; +} g; +// CHECK-LABEL: LLVMType:%struct.G = +// LAYOUT-SAME: type { i32, i16, i8, i8 } +// LAYOUT-DWN32-SAME: type <{ i32, i8, i8 }> +// CHECK: BitFields:[ +// LAYOUT-NEXT: + +#if _LP64 +struct A64 { + int a : 16; + short b : 8; + long c : 16; + int d : 16; + signed char e : 8; +} a64; +// CHECK-64-LABEL: LLVMType:%struct.A64 = +// LAYOUT-64-SAME: type { i64 } +// LAYOUT-64-DWN-SAME: type { i64 } +// CHECK-64: BitFields:[ +// LAYOUT-64-NEXT: + +struct B64 { + int a : 16; + short b : 8; + long c : 16; + int d : 16; + signed char e; // not a bitfield +} b64; +// CHECK-64-LABEL: LLVMType:%struct.B64 = +// LAYOUT-64-FLEX-SAME: type <{ i16, i8, i32, i8 }> +// LAYOUT-64-STRICT-SAME: type <{ i16, i8, i16, i16, i8 }> +// LAYOUT-64-DWN-SAME: type <{ i16, i8, i32, i8 }> +// CHECK-64: BitFields:[ +// LAYOUT-64-FLEX-NEXT: + +struct C64 { + int a : 15; + short b : 8; + long c : 16; + int d : 15; + signed char e : 7; +} c64; +// CHECK-64-LABEL: LLVMType:%struct.C64 = +// LAYOUT-64-SAME: type { i64 } +// LAYOUT-64-DWN-SAME: type { i64 } +// CHECK-64: BitFields:[ +// LAYOUT-64-NEXT: + +#endif diff --git a/clang/test/CodeGen/debug-info-bitfield-0-struct.c b/clang/test/CodeGen/debug-info-bitfield-0-struct.c index 0535b62677142..9fadf898e3466 100644 --- a/clang/test/CodeGen/debug-info-bitfield-0-struct.c +++ b/clang/test/CodeGen/debug-info-bitfield-0-struct.c @@ -101,8 +101,10 @@ struct None_B { int y : 4; }; -struct None_C { - // BOTH-DAG: ![[NONE_C:[0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "None_C", file: !{{[0-9]+}}, line: {{[0-9]+}}, size: 32, elements: ![[NONE_C_ELEMENTS:[0-9]+]]) +// AMDGCN does not do unaligned access cheaply, so the bitfield access units +// would remain single bytes, without the aligned attribute +struct __attribute__((aligned(4))) None_C { + // BOTH-DAG: 
![[NONE_C:[0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "None_C", file: !{{[0-9]+}}, line: {{[0-9]+}}, size: 32, align: 32, elements: ![[NONE_C_ELEMENTS:[0-9]+]]) // BOTH-DAG: ![[NONE_C_ELEMENTS]] = !{![[NONE_C_X:[0-9]+]], ![[NONE_C_Y:[0-9]+]], ![[NONE_C_A:[0-9]+]], ![[NONE_C_B:[0-9]+]]} // BOTH-DAG: ![[NONE_C_X]] = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: ![[NONE_C]], file: !{{[0-9]+}}, line: {{[0-9]+}}, baseType: !{{[0-9]+}}, size: 8, flags: DIFlagBitField, extraData: i64 0) // BOTH-DAG: ![[NONE_C_Y]] = !DIDerivedType(tag: DW_TAG_member, name: "y", scope: ![[NONE_C]], file: !{{[0-9]+}}, line: {{[0-9]+}}, baseType: !{{[0-9]+}}, size: 8, offset: 8, flags: DIFlagBitField, extraData: i64 0) diff --git a/clang/test/CodeGen/no-bitfield-type-align.c b/clang/test/CodeGen/no-bitfield-type-align.c index 53ed5e9ad8f85..1861c6886a35b 100644 --- a/clang/test/CodeGen/no-bitfield-type-align.c +++ b/clang/test/CodeGen/no-bitfield-type-align.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -triple x86_64-apple-darwin -fno-bitfield-type-align -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -fno-bitfield-type-align -fdump-record-layouts-simple -emit-llvm -o %t %s | FileCheck %s -check-prefix=LAYOUT +// RUN: FileCheck %s <%t struct S { unsigned short: 0; @@ -7,6 +8,13 @@ struct S { unsigned short f2:15; }; +// LAYOUT-LABEL: LLVMType:%struct.S = +// LAYOUT-SAME: type { i32 } +// LAYOUT: BitFields:[ +// LAYOUT-NEXT: + // CHECK: define{{.*}} void @test_zero_width_bitfield(ptr noundef %[[A:.*]]) // CHECK: %[[BF_LOAD:.*]] = load i32, ptr %[[V1:.*]], align 1 // CHECK: %[[BF_CLEAR:.*]] = and i32 %[[BF_LOAD]], 32767 diff --git a/clang/test/CodeGen/struct-x86-darwin.c b/clang/test/CodeGen/struct-x86-darwin.c index 5191441cabaf0..e79ecefb880df 100644 --- a/clang/test/CodeGen/struct-x86-darwin.c +++ b/clang/test/CodeGen/struct-x86-darwin.c @@ -1,25 +1,70 @@ -// RUN: %clang_cc1 %s -emit-llvm -triple=i686-apple-darwin9 -o - | FileCheck %s -// CHECK: STest1 = type { i32, [4 x i16], double } -// CHECK: STest2 = type { i16, i16, i32, i32 } -// CHECK: STest3 = type { i8, i16, i32 } -// CHECK: STestB1 = type { i8, i8 } -// CHECK: STestB2 = type { i8, i8, i8 } -// CHECK: STestB3 = type { i8, i8 } -// CHECK: STestB4 = type { i8, i8, i8, i8 } -// CHECK: STestB5 = type { i8, i16, i8 } -// CHECK: STestB6 = type { i8, i8, i16 } +// RUN: %clang_cc1 %s -emit-llvm -o /dev/null -triple=i686-apple-darwin9 -fdump-record-layouts-simple | FileCheck %s + // Test struct layout for x86-darwin target struct STest1 {int x; short y[4]; double z; } st1; struct STest2 {short a,b; int c,d; } st2; struct STest3 {char a; short b; int c; } st3; -// Bitfields +// Bitfields struct STestB1 {char a; char b:2; } stb1; struct STestB2 {char a; char b:5; char c:4; } stb2; struct STestB3 {char a; char b:2; } stb3; struct STestB4 {char a; short b:2; char c; } stb4; struct STestB5 {char a; short b:10; char c; } stb5; -struct STestB6 {int a:1; char b; int c:13 } stb6; +struct STestB6 {int a:1; char b; int c:13; } stb6; // Packed struct STestP1 {char a; short b; int c; } __attribute__((__packed__)) stp1; + +// CHECK-LABEL: LLVMType:%struct.STest1 = +// CHECK-SAME: type { i32, [4 x i16], double } +// CHECK: BitFields:[ +// CHECK-NEXT: ]> + +// CHECK-LABEL: LLVMType:%struct.STest2 = +// CHECK-SAME: type { i16, i16, i32, i32 } +// CHECK: BitFields:[ +// CHECK-NEXT: ]> + +// CHECK-LABEL: LLVMType:%struct.STest3 = +// CHECK-SAME: type { i8, i16, i32 } +// CHECK: BitFields:[ +// CHECK-NEXT: ]> + +// 
CHECK-LABEL: LLVMType:%struct.STestB1 = +// CHECK-SAME: type { i8, i8 } +// CHECK: BitFields:[ +// CHECK-NEXT: + +// CHECK-LABEL: LLVMType:%struct.STestB2 = +// CHECK-SAME: type <{ i8, i16 }> +// CHECK: BitFields:[ +// CHECK-NEXT: + +// CHECK-LABEL: LLVMType:%struct.STestB3 = +// CHECK-SAME: type { i8, i8 } +// CHECK: BitFields:[ +// CHECK-NEXT: + +// CHECK-LABEL: LLVMType:%struct.STestB4 = +// CHECK-SAME: type { i8, i8, i8, i8 } +// CHECK: BitFields:[ +// CHECK-NEXT: + +// CHECK-LABEL: LLVMType:%struct.STestB5 = +// CHECK-SAME: type { i8, i16, i8 } +// CHECK: BitFields:[ +// CHECK-NEXT: + +// CHECK-LABEL: LLVMType:%struct.STestB6 = +// CHECK-SAME: type { i8, i8, i16 } +// CHECK: BitFields:[ +// CHECK-NEXT: diff --git a/clang/test/CodeGen/tbaa-struct.cpp b/clang/test/CodeGen/tbaa-struct.cpp index 9b4b7415142d9..ca076ce5aa273 100644 --- a/clang/test/CodeGen/tbaa-struct.cpp +++ b/clang/test/CodeGen/tbaa-struct.cpp @@ -197,7 +197,7 @@ void copy12(UnionMember2 *a1, UnionMember2 *a2) { // CHECK-OLD: [[TS6]] = !{i64 0, i64 2, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE:!.+]]} // CHECK-OLD: [[TAG_DOUBLE]] = !{[[DOUBLE:!.+]], [[DOUBLE]], i64 0} // CHECK-OLD [[DOUBLE]] = !{!"double", [[CHAR]], i64 0} -// CHECK-OLD: [[TS7]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]], i64 3, i64 1, [[TAG_CHAR]], i64 4, i64 1, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE]], i64 16, i64 1, [[TAG_CHAR]]} +// CHECK-OLD: [[TS7]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]], i64 3, i64 2, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE]], i64 16, i64 1, [[TAG_CHAR]]} // CHECK-OLD: [[TS8]] = !{i64 0, i64 4, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE]]} // CHECK-OLD: [[TS9]] = !{i64 0, i64 8, [[TAG_CHAR]], i64 8, i64 4, [[TAG_INT]]} // CHECK-OLD: [[TS10]] = !{i64 0, i64 4, [[TAG_INT]], i64 8, i64 8, [[TAG_CHAR]]} diff --git a/clang/test/CodeGenCXX/bitfield-access-empty.cpp b/clang/test/CodeGenCXX/bitfield-access-empty.cpp new file mode 100644 index 0000000000000..c5e6f55ffa696 --- /dev/null +++ b/clang/test/CodeGenCXX/bitfield-access-empty.cpp @@ -0,0 +1,150 @@ +// Check if we can merge bitfields across empty members + +// Configs that have cheap unaligned access +// Little Endian +// RUN: %clang_cc1 -triple=aarch64-apple-darwin %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=aarch64-linux-gnu %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=arm-apple-darwin %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT-DWN32 %s +// RUN: %clang_cc1 -triple=arm-none-eabi %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=i686-linux-gnu %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=loongarch64-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=powerpcle-linux-gnu %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=ve-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=wasm32 %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck 
--check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=wasm64 %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=x86_64-linux-gnu %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s + +// Big Endian +// RUN: %clang_cc1 -triple=powerpc-linux-gnu %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=powerpc64-linux-gnu %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=systemz %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s + +// Configs that have expensive unaligned access +// Little Endian +// RUN: %clang_cc1 -triple=amdgcn-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=arc-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=bpf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=csky %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=hexagon-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=le64-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=loongarch32-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=nvptx-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=riscv32 %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=riscv64 %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=spir-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=xcore-none-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s + +// Big endian +// RUN: %clang_cc1 -triple=lanai-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=m68k-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=mips-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=mips64-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=sparc-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=tce-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s + +struct Empty {}; + +struct P1 { + unsigned a : 16; + [[no_unique_address]] Empty e; + unsigned b : 16; +} p1; +// CHECK-LABEL: LLVMType:%struct.P1 = +// LAYOUT-SAME: type { i16, i16 } +// LAYOUT-DWN32-SAME: 
type { i16, i16 } +// CHECK-NEXT: NonVirtualBaseLLVMType:%struct.P1 = +// CHECK: BitFields:[ +// LAYOUT-NEXT: + +struct P2 { + unsigned a : 15; + [[no_unique_address]] Empty e; + unsigned b : 15; +} p2; +// CHECK-LABEL: LLVMType:%struct.P2 = +// LAYOUT-SAME: type { i16, i16 } +// LAYOUT-DWN32-SAME: type { i16, i16 } +// CHECK-NEXT: NonVirtualBaseLLVMType:%struct.P2 = +// CHECK: BitFields:[ +// LAYOUT-NEXT: + +struct P3 { + unsigned a : 16; + Empty e; + unsigned b : 16; +} p3; +// CHECK-LABEL: LLVMType:%struct.P3 = +// LAYOUT-SAME: type { i16, %struct.Empty, i16, [2 x i8] } +// LAYOUT-DWN32-SAME: type <{ i16, %struct.Empty, i16 }> +// CHECK-NEXT: NonVirtualBaseLLVMType:%struct.P3 = +// CHECK: BitFields:[ +// LAYOUT-NEXT: + +struct P4 { + unsigned : 0; +} p4; +// CHECK-LABEL: LLVMType:%struct.P4 = +// LAYOUT-SAME: type { {{.+}} } +// CHECK-NEXT: NonVirtualBaseLLVMType:%struct.P4 = +// CHECK: BitFields:[ +// CHECK-NEXT: ]> + +struct P5 { + ~P5(); + unsigned : 0; +} p5; +// CHECK-LABEL: LLVMType:%struct.P5 = +// CHECK-NEXT: NonVirtualBaseLLVMType:%struct.P5.base = type {} +// CHECK: BitFields:[ +// CHECK-NEXT: ]> + +struct P6 { + unsigned a : 16; + unsigned b : 8; + [[no_unique_address]] Empty e; + unsigned c; +} p6; +// CHECK-LABEL: LLVMType:%struct.P6 = +// LAYOUT-SAME: type { i32, i32 } +// LAYOUT-DWN32-SAME: type { i32, i32 } +// CHECK-NEXT: NonVirtualBaseLLVMType:%struct.P6 = +// CHECK: BitFields:[ +// LAYOUT-NEXT: + +struct P7 { + unsigned a : 16; + unsigned b : 8; + Empty e; + unsigned c; +} p7; +// CHECK-LABEL: LLVMType:%struct.P7 = +// LAYOUT-SAME: type { i16, i8, %struct.Empty, i32 } +// LAYOUT-DWN32-SAME: type { i16, i8, %struct.Empty, i32 } +// CHECK-NEXT: NonVirtualBaseLLVMType:%struct.P7 = +// CHECK: BitFields:[ +// LAYOUT-NEXT: diff --git a/clang/test/CodeGenCXX/bitfield-access-tail.cpp b/clang/test/CodeGenCXX/bitfield-access-tail.cpp new file mode 100644 index 0000000000000..68716fdf3b1da --- /dev/null +++ b/clang/test/CodeGenCXX/bitfield-access-tail.cpp @@ -0,0 +1,115 @@ +// Check we use tail padding if it is known to be safe + +// Configs that have cheap unaligned access +// Little Endian +// RUN: %clang_cc1 -triple=aarch64-apple-darwin %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=aarch64-linux-gnu %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=arm-apple-darwin %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT-DWN32 %s +// RUN: %clang_cc1 -triple=arm-none-eabi %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=i686-linux-gnu %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=loongarch64-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=powerpcle-linux-gnu %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=ve-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=wasm32 %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=wasm64 %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck 
--check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=x86_64-linux-gnu %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s + +// Big Endian +// RUN: %clang_cc1 -triple=powerpc-linux-gnu %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=powerpc64-linux-gnu %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=systemz %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s + +// Configs that have expensive unaligned access +// Little Endian +// RUN: %clang_cc1 -triple=amdgcn-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=arc-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=bpf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=csky %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=hexagon-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=le64-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=loongarch32-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=nvptx-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=riscv32 %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=riscv64 %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=spir-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=xcore-none-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s + +// Big endian +// RUN: %clang_cc1 -triple=lanai-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=m68k-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=mips-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=mips64-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=sparc-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s +// RUN: %clang_cc1 -triple=tce-elf %s -emit-llvm -o /dev/null -fdump-record-layouts-simple | FileCheck --check-prefixes CHECK,LAYOUT %s + +// Can use tail padding +struct Pod { + int a : 16; + int b : 8; +} P; +// CHECK-LABEL: LLVMType:%struct.Pod = +// LAYOUT-SAME: type { i32 } +// LAYOUT-DWN32-SAME: type <{ i16, i8 }> +// CHECK-NEXT: NonVirtualBaseLLVMType:%struct.Pod = +// CHECK: BitFields:[ +// LAYOUT-NEXT: + +// No tail padding +struct __attribute__((packed)) PPod { + int 
a : 16; + int b : 8; +} PP; +// CHECK-LABEL: LLVMType:%struct.PPod = +// LAYOUT-SAME: type <{ i16, i8 }> +// LAYOUT-DWN32-SAME: type <{ i16, i8 }> +// CHECK-NEXT: NonVirtualBaseLLVMType:%struct.PPod = +// CHECK: BitFields:[ +// LAYOUT-NEXT: + +// Cannot use tail padding +struct NonPod { + ~NonPod(); + int a : 16; + int b : 8; +} NP; +// CHECK-LABEL: LLVMType:%struct.NonPod = +// LAYOUT-SAME: type <{ i16, i8, i8 }> +// LAYOUT-DWN32-SAME: type <{ i16, i8 }> +// CHECK-NEXT: NonVirtualBaseLLVMType:%struct. +// LAYOUT-SAME: NonPod.base = type <{ i16, i8 }> +// LAYOUT-DWN32-SAME: NonPod = type <{ i16, i8 }> +// CHECK: BitFields:[ +// LAYOUT-NEXT: + +// No tail padding +struct __attribute__((packed)) PNonPod { + ~PNonPod(); + int a : 16; + int b : 8; +} PNP; +// CHECK-LABEL: LLVMType:%struct.PNonPod = +// LAYOUT-SAME: type <{ i16, i8 }> +// LAYOUT-DWN32-SAME: type <{ i16, i8 }> +// CHECK-NEXT: NonVirtualBaseLLVMType:%struct.PNonPod = +// CHECK: BitFields:[ +// LAYOUT-NEXT: diff --git a/clang/test/CodeGenCXX/bitfield-ir.cpp b/clang/test/CodeGenCXX/bitfield-ir.cpp new file mode 100644 index 0000000000000..76c144072da68 --- /dev/null +++ b/clang/test/CodeGenCXX/bitfield-ir.cpp @@ -0,0 +1,101 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// RUN: %clang_cc1 -triple x86_64-linux-gnu -O2 -emit-llvm -o - %s | FileCheck %s + +struct Tail { + ~Tail(); + int a : 16; + int b : 8; +}; + +struct Char { + int a : 16; + int b : 8; + char c; +}; + +struct Int { + int a : 16; + int b : 8; + int c; +}; + + +// CHECK-LABEL: define dso_local void @_Z1AP4Tail +// CHECK-SAME: (ptr nocapture noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[BF_LOAD:%.*]] = load i16, ptr [[P]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add i16 [[BF_LOAD]], 1 +// CHECK-NEXT: store i16 [[INC]], ptr [[P]], align 4 +// CHECK-NEXT: ret void +// +void A (Tail *p) { + p->a++; +} + +// CHECK-LABEL: define dso_local void @_Z1BP4Tail +// CHECK-SAME: (ptr nocapture noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 2 +// CHECK-NEXT: [[BF_LOAD:%.*]] = load i8, ptr [[B]], align 2 +// CHECK-NEXT: [[INC:%.*]] = add i8 [[BF_LOAD]], 1 +// CHECK-NEXT: store i8 [[INC]], ptr [[B]], align 2 +// CHECK-NEXT: ret void +// +void B (Tail *p) { + p->b++; +} + +// CHECK-LABEL: define dso_local void @_Z1AP4Char +// CHECK-SAME: (ptr nocapture noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[BF_LOAD:%.*]] = load i16, ptr [[P]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add i16 [[BF_LOAD]], 1 +// CHECK-NEXT: store i16 [[INC]], ptr [[P]], align 4 +// CHECK-NEXT: ret void +// +void A (Char *p) { + p->a++; +} + +// CHECK-LABEL: define dso_local void @_Z1BP4Char +// CHECK-SAME: (ptr nocapture noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 2 +// CHECK-NEXT: [[BF_LOAD:%.*]] = load i8, ptr [[B]], align 2 +// CHECK-NEXT: [[INC:%.*]] = add i8 [[BF_LOAD]], 1 +// CHECK-NEXT: store i8 [[INC]], ptr [[B]], align 2 +// CHECK-NEXT: ret void +// +void B (Char *p) { + p->b++; +} + +// CHECK-LABEL: define dso_local void @_Z1AP3Int +// CHECK-SAME: (ptr nocapture noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[P]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add i32 [[BF_LOAD]], 1 +// 
CHECK-NEXT: [[BF_VALUE:%.*]] = and i32 [[INC]], 65535 +// CHECK-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD]], -65536 +// CHECK-NEXT: [[BF_SET:%.*]] = or disjoint i32 [[BF_VALUE]], [[BF_CLEAR]] +// CHECK-NEXT: store i32 [[BF_SET]], ptr [[P]], align 4 +// CHECK-NEXT: ret void +// +void A (Int *p) { + p->a++; +} + +// CHECK-LABEL: define dso_local void @_Z1BP3Int +// CHECK-SAME: (ptr nocapture noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[BF_LOAD:%.*]] = load i32, ptr [[P]], align 4 +// CHECK-NEXT: [[BF_VALUE:%.*]] = add i32 [[BF_LOAD]], 65536 +// CHECK-NEXT: [[BF_SHL2:%.*]] = and i32 [[BF_VALUE]], 16711680 +// CHECK-NEXT: [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD]], -16711681 +// CHECK-NEXT: [[BF_SET:%.*]] = or disjoint i32 [[BF_SHL2]], [[BF_CLEAR]] +// CHECK-NEXT: store i32 [[BF_SET]], ptr [[P]], align 4 +// CHECK-NEXT: ret void +// +void B (Int *p) { + p->b++; +} diff --git a/clang/test/CodeGenCXX/bitfield.cpp b/clang/test/CodeGenCXX/bitfield.cpp index a478eb44915e7..7545e02840e6b 100644 --- a/clang/test/CodeGenCXX/bitfield.cpp +++ b/clang/test/CodeGenCXX/bitfield.cpp @@ -1,7 +1,9 @@ -// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s \ -// RUN: | FileCheck -check-prefix=CHECK-X86-64 %s -// RUN: %clang_cc1 -triple powerpc64-unknown-unknown -emit-llvm -o - %s \ -// RUN: | FileCheck -check-prefix=CHECK-PPC64 %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fdump-record-layouts-simple \ +// RUN: -emit-llvm -o %t %s | FileCheck -check-prefixes=LAYOUT,LAYOUT-X86-64 %s +// RUN: FileCheck -check-prefix=CHECK-X86-64 %s <%t +// RUN: %clang_cc1 -triple powerpc64-unknown-unknown -fdump-record-layouts-simple\ +// RUN: -emit-llvm -o %t %s | FileCheck -check-prefixes=LAYOUT,LAYOUT-PPC64 %s +// RUN: FileCheck -check-prefix=CHECK-PPC64 %s <%t // // Tests for bitfield access patterns in C++ with special attention to // conformance to C++11 memory model requirements. 
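These checks encode the C++11 rule that a bit-field and an adjacent non-bit-field member are distinct memory locations, so a store to one may not spill into the other. A small standalone sketch of why (the struct and names here are illustrative, not taken from the test suite):

```cpp
#include <thread>

struct S {
  unsigned b : 24; // its own memory location ([intro.memory]p4)
  char c;          // a separate memory location
};

int main() {
  S s{};
  // Race-free under the C++11 memory model: b and c are different memory
  // locations. Codegen therefore may not implement the store to s.b as a
  // 4-byte read-modify-write that also rewrites s.c.
  std::thread t1([&s] { s.b = 1; });
  std::thread t2([&s] { s.c = 'y'; });
  t1.join();
  t2.join();
}
```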
@@ -19,6 +21,27 @@ namespace N0 { unsigned b70 : 6; unsigned b71 : 2; }; +// LAYOUT-LABEL: LLVMType:%"struct.N0::S" = +// LAYOUT-SAME: type { i64 } +// LAYOUT: BitFields:[ +// LAYOUT-X86-64-NEXT: + unsigned read00(S* s) { // CHECK-X86-64-LABEL: define{{.*}} i32 @_ZN2N06read00 // CHECK-X86-64: %[[val:.*]] = load i64, ptr %{{.*}} @@ -149,6 +172,13 @@ namespace N1 { unsigned b : 1; char c; }; +// LAYOUT-LABEL: LLVMType:%"struct.N1::S" = +// LAYOUT-SAME: type { i8, i8, i8, i8 } +// LAYOUT: BitFields:[ +// LAYOUT-X86-64-NEXT: + unsigned read(S* s) { // CHECK-X86-64-LABEL: define{{.*}} i32 @_ZN2N14read // CHECK-X86-64: %[[ptr:.*]] = getelementptr inbounds %{{.*}}, ptr %{{.*}}, i32 0, i32 1 @@ -193,6 +223,13 @@ namespace N2 { unsigned b : 24; void *p; }; +// LAYOUT-LABEL: LLVMType:%"struct.N2::S" = +// LAYOUT-SAME: type { i32, ptr } +// LAYOUT: BitFields:[ +// LAYOUT-X86-64-NEXT: + unsigned read(S* s) { // CHECK-X86-64-LABEL: define{{.*}} i32 @_ZN2N24read // CHECK-X86-64: %[[val:.*]] = load i32, ptr %{{.*}} @@ -230,6 +267,13 @@ namespace N3 { struct S { unsigned b : 24; }; +// LAYOUT-LABEL: LLVMType:%"struct.N3::S" = +// LAYOUT-SAME: type { i32 } +// LAYOUT: BitFields:[ +// LAYOUT-X86-64-NEXT: + unsigned read(S* s) { // CHECK-X86-64-LABEL: define{{.*}} i32 @_ZN2N34read // CHECK-X86-64: %[[val:.*]] = load i32, ptr %{{.*}} @@ -276,6 +320,14 @@ namespace N4 { char c; }; #endif +// LAYOUT-LABEL: LLVMType:%"struct.N4::Base" = +// LAYOUT-SAME: type <{ ptr, [3 x i8], [5 x i8] }> +// LAYOUT-NEXT: NonVirtualBaseLLVMType:%"struct.N4::Base.base" = type <{ ptr, [3 x i8] }> +// LAYOUT: BitFields:[ +// LAYOUT-X86-64-NEXT: + unsigned read(Base* s) { // FIXME: We should widen this load as long as the function isn't being // instrumented by ThreadSanitizer. @@ -317,6 +369,22 @@ namespace N5 { struct X { unsigned b : 24; char c; } x; struct Y { unsigned b : 24; } y; }; +// LAYOUT-LABEL: LLVMType:%"struct.N5::U::X" = +// LAYOUT-SAME: type { [3 x i8], i8 } +// LAYOUT-NEXT: NonVirtualBaseLLVMType:%"struct.N5::U::X" = +// LAYOUT: BitFields:[ +// LAYOUT-X86-64-NEXT: + +// LAYOUT-LABEL: LLVMType:%"struct.N5::U::Y" = +// LAYOUT-SAME: type { i32 } +// LAYOUT-NEXT: NonVirtualBaseLLVMType:%"struct.N5::U::Y" = +// LAYOUT: BitFields:[ +// LAYOUT-X86-64-NEXT: + unsigned read(U* u) { // CHECK-X86-64-LABEL: define{{.*}} i32 @_ZN2N54read // CHECK-X86-64: %[[val:.*]] = load i32, ptr %{{.*}} @@ -360,6 +428,15 @@ namespace N6 { unsigned char : 0; unsigned char b2 : 8; }; +// LAYOUT-LABEL: LLVMType:%"struct.N6::S" = +// LAYOUT-SAME: type { [3 x i8], i8 } +// LAYOUT: BitFields:[ +// LAYOUT-X86-64-NEXT: + unsigned read(S* s) { // CHECK-X86-64-LABEL: define{{.*}} i32 @_ZN2N64read // CHECK-X86-64: %[[val1:.*]] = load i24, ptr %{{.*}} @@ -416,6 +493,22 @@ namespace N7 { char c; }; #endif +// LAYOUT-LABEL: LLVMType:%"struct.N7::B1" = +// LAYOUT-SAME: type <{ ptr, [3 x i8], [5 x i8] }> +// LAYOUT-NEXT: NonVirtualBaseLLVMType:%"struct.N7::B1.base" = type <{ ptr, [3 x i8] }> +// LAYOUT: BitFields:[ +// LAYOUT-X86-64-NEXT: + +// LAYOUT-LABEL: LLVMType:%"struct.N7::B2" = +// LAYOUT-SAME: type <{ ptr, [3 x i8], [5 x i8], %"struct.N7::B1.base", [5 x i8] }> +// LAYOUT-NEXT: NonVirtualBaseLLVMType:%"struct.N7::B2.base" = type <{ ptr, [3 x i8] }> +// LAYOUT: BitFields:[ +// LAYOUT-X86-64-NEXT: + unsigned read(B2* s) { // FIXME: We should widen this load as long as the function isn't being // instrumented by ThreadSanitizer. 
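The FIXME above asks for load widening; a sketch of the idea outside the test (assumed types, little-endian layout for simplicity):

```cpp
#include <cstdint>
#include <cstring>

struct Narrow {
  unsigned b : 24; // the struct is padded out to 4 bytes
};

// Exact-width read: touch only the 3 bytes holding the bit-field. This is
// the conservative form preferred under ThreadSanitizer instrumentation.
unsigned read_exact(const Narrow *s) {
  unsigned v = 0;
  std::memcpy(&v, s, 3);
  return v & 0xFFFFFFu;
}

// Widened read: one 4-byte load is legal because the fourth byte is padding
// of the same object, never a separate memory location another thread owns.
unsigned read_wide(const Narrow *s) {
  std::uint32_t v;
  std::memcpy(&v, s, sizeof(v));
  return v & 0xFFFFFFu;
}
```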
diff --git a/clang/test/CodeGenHLSL/builtins/round.hlsl b/clang/test/CodeGenHLSL/builtins/round.hlsl index b9f35bd3712d1..33d761dbdfbea 100644 --- a/clang/test/CodeGenHLSL/builtins/round.hlsl +++ b/clang/test/CodeGenHLSL/builtins/round.hlsl @@ -7,47 +7,47 @@ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF // NATIVE_HALF: define noundef half @ -// NATIVE_HALF: %elt.round = call half @llvm.round.f16( -// NATIVE_HALF: ret half %elt.round +// NATIVE_HALF: %elt.roundeven = call half @llvm.roundeven.f16( +// NATIVE_HALF: ret half %elt.roundeven // NO_HALF: define noundef float @"?test_round_half@@YA$halff@$halff@@Z"( -// NO_HALF: %elt.round = call float @llvm.round.f32( -// NO_HALF: ret float %elt.round +// NO_HALF: %elt.roundeven = call float @llvm.roundeven.f32( +// NO_HALF: ret float %elt.roundeven half test_round_half(half p0) { return round(p0); } // NATIVE_HALF: define noundef <2 x half> @ -// NATIVE_HALF: %elt.round = call <2 x half> @llvm.round.v2f16 -// NATIVE_HALF: ret <2 x half> %elt.round +// NATIVE_HALF: %elt.roundeven = call <2 x half> @llvm.roundeven.v2f16 +// NATIVE_HALF: ret <2 x half> %elt.roundeven // NO_HALF: define noundef <2 x float> @ -// NO_HALF: %elt.round = call <2 x float> @llvm.round.v2f32( -// NO_HALF: ret <2 x float> %elt.round +// NO_HALF: %elt.roundeven = call <2 x float> @llvm.roundeven.v2f32( +// NO_HALF: ret <2 x float> %elt.roundeven half2 test_round_half2(half2 p0) { return round(p0); } // NATIVE_HALF: define noundef <3 x half> @ -// NATIVE_HALF: %elt.round = call <3 x half> @llvm.round.v3f16 -// NATIVE_HALF: ret <3 x half> %elt.round +// NATIVE_HALF: %elt.roundeven = call <3 x half> @llvm.roundeven.v3f16 +// NATIVE_HALF: ret <3 x half> %elt.roundeven // NO_HALF: define noundef <3 x float> @ -// NO_HALF: %elt.round = call <3 x float> @llvm.round.v3f32( -// NO_HALF: ret <3 x float> %elt.round +// NO_HALF: %elt.roundeven = call <3 x float> @llvm.roundeven.v3f32( +// NO_HALF: ret <3 x float> %elt.roundeven half3 test_round_half3(half3 p0) { return round(p0); } // NATIVE_HALF: define noundef <4 x half> @ -// NATIVE_HALF: %elt.round = call <4 x half> @llvm.round.v4f16 -// NATIVE_HALF: ret <4 x half> %elt.round +// NATIVE_HALF: %elt.roundeven = call <4 x half> @llvm.roundeven.v4f16 +// NATIVE_HALF: ret <4 x half> %elt.roundeven // NO_HALF: define noundef <4 x float> @ -// NO_HALF: %elt.round = call <4 x float> @llvm.round.v4f32( -// NO_HALF: ret <4 x float> %elt.round +// NO_HALF: %elt.roundeven = call <4 x float> @llvm.roundeven.v4f32( +// NO_HALF: ret <4 x float> %elt.roundeven half4 test_round_half4(half4 p0) { return round(p0); } // CHECK: define noundef float @ -// CHECK: %elt.round = call float @llvm.round.f32( -// CHECK: ret float %elt.round +// CHECK: %elt.roundeven = call float @llvm.roundeven.f32( +// CHECK: ret float %elt.roundeven float test_round_float(float p0) { return round(p0); } // CHECK: define noundef <2 x float> @ -// CHECK: %elt.round = call <2 x float> @llvm.round.v2f32 -// CHECK: ret <2 x float> %elt.round +// CHECK: %elt.roundeven = call <2 x float> @llvm.roundeven.v2f32 +// CHECK: ret <2 x float> %elt.roundeven float2 test_round_float2(float2 p0) { return round(p0); } // CHECK: define noundef <3 x float> @ -// CHECK: %elt.round = call <3 x float> @llvm.round.v3f32 -// CHECK: ret <3 x float> %elt.round +// CHECK: %elt.roundeven = call <3 x float> @llvm.roundeven.v3f32 +// CHECK: ret <3 x float> %elt.roundeven float3 test_round_float3(float3 p0) { return round(p0); } // CHECK: define noundef <4 x float> @ -// CHECK: %elt.round = 
call <4 x float> @llvm.round.v4f32
-// CHECK: ret <4 x float> %elt.round
+// CHECK: %elt.roundeven = call <4 x float> @llvm.roundeven.v4f32
+// CHECK: ret <4 x float> %elt.roundeven
 float4 test_round_float4(float4 p0) { return round(p0); }
diff --git a/clang/test/Driver/darwin-ld-reexports.c b/clang/test/Driver/darwin-ld-reexports.c
new file mode 100644
index 0000000000000..2e96db49a8a38
--- /dev/null
+++ b/clang/test/Driver/darwin-ld-reexports.c
@@ -0,0 +1,21 @@
+// RUN: touch %t.o
+// RUN: %clang -target arm64-apple-darwin13 -### \
+// RUN: -reexport_framework Foo -reexport-lBar -reexport_library Baz %t.o 2> %t.log
+
+// Check older spellings also work.
+// RUN: %clang -target arm64-apple-darwin13 -### \
+// RUN: -Xlinker -reexport_framework -Xlinker Forest \
+// RUN: -Xlinker -reexport-lBranch \
+// RUN: -Xlinker -reexport_library -Xlinker Flower %t.o 2>> %t.log
+// RUN: FileCheck -check-prefix=LINK_REEXPORT %s < %t.log
+
+// LINK_REEXPORT: {{ld(.exe)?"}}
+// LINK_REEXPORT: "-reexport_framework" "Foo"
+// LINK_REEXPORT: "-reexport-lBar"
+// LINK_REEXPORT: "-reexport_library" "Baz"
+// LINK_REEXPORT: "-reexport_framework" "Forest"
+// LINK_REEXPORT: "-reexport-lBranch"
+// LINK_REEXPORT: "-reexport_library" "Flower"
+
+// Make sure arguments are not repeated.
+// LINK_REEXPORT-NOT: "-reexport
diff --git a/clang/test/InstallAPI/diagnostics-dsym.test b/clang/test/InstallAPI/diagnostics-dsym.test
new file mode 100644
index 0000000000000..8a1b394f2f868
--- /dev/null
+++ b/clang/test/InstallAPI/diagnostics-dsym.test
@@ -0,0 +1,39 @@
+; REQUIRES: x86_64-darwin
+
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+
+// Build a simple dylib with debug info.
+; RUN: %clang --target=x86_64-apple-macos10.15 -g -dynamiclib %t/foo.c \
+; RUN: -current_version 1 -compatibility_version 1 -L%t/usr/lib \
+; RUN: -save-temps \
+; RUN: -o %t/foo.dylib -install_name %t/foo.dylib
+; RUN: dsymutil %t/foo.dylib -o %t/foo.dSYM
+
+; RUN: not clang-installapi -x c++ --target=x86_64-apple-macos10.15 \
+; RUN: -install_name %t/foo.dylib \
+; RUN: -current_version 1 -compatibility_version 1 \
+; RUN: -o %t/output.tbd \
+; RUN: --verify-against=%t/foo.dylib --dsym=%t/foo.dSYM \
+; RUN: --verify-mode=Pedantic 2>&1 | FileCheck %s
+
+; CHECK: violations found for x86_64
+; CHECK: foo.c:5:0: error: no declaration found for exported symbol 'bar' in dynamic library
+; CHECK: foo.c:1:0: error: no declaration found for exported symbol 'foo' in dynamic library
+
+;--- foo.c
+int foo(void) {
+  return 1;
+}
+extern char bar;
+char bar = 'a';
+
+;--- usr/lib/libSystem.tbd
+--- !tapi-tbd
+tbd-version: 4
+targets: [ x86_64-macos ]
+install-name: '/usr/lib/libSystem.B.dylib'
+exports:
+  - targets: [ x86_64-macos ]
+    symbols: [ dyld_stub_binder ]
+...
diff --git a/clang/test/Modules/no-local-decl-in-reduced-bmi.cppm b/clang/test/Modules/no-local-decl-in-reduced-bmi.cppm
new file mode 100644
index 0000000000000..41ae2bf0dec80
--- /dev/null
+++ b/clang/test/Modules/no-local-decl-in-reduced-bmi.cppm
@@ -0,0 +1,33 @@
+// Test that we won't record local declarations by default in reduced BMI.
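+//
+// (Illustrative aside, not in the original test: a reduced BMI drops
+// entities that importers can never reference, such as the local variable
+// `v` below, so its declaration record should be absent from the
+// llvm-bcanalyzer dump.)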
+ +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: cd %t +// +// RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-reduced-module-interface -o %t/a.pcm +// RUN: llvm-bcanalyzer --dump --disable-histogram --show-binary-blobs %t/a.pcm > %t/a.dump +// RUN: cat %t/a.dump | FileCheck %t/a.cppm +// +// RUN: %clang_cc1 -std=c++20 %t/b.cppm -emit-reduced-module-interface -o %t/b.pcm +// RUN: llvm-bcanalyzer --dump --disable-histogram --show-binary-blobs %t/b.pcm > %t/b.dump +// RUN: cat %t/b.dump | FileCheck %t/b.cppm + +//--- a.cppm +export module a; +export int func() { + int v = 43; + return 43; +} + +// Test that the variable declaration is not recorded completely. +// CHECK-NOT: struct AddNonNull { typedef _Nonnull T type; // expected-error{{nullability specifier '_Nonnull' cannot be applied to non-pointer type 'int'}} // expected-error@-1{{nullability specifier '_Nonnull' cannot be applied to non-pointer type 'std::nullptr_t'}} - // expected-error@-2{{nullability specifier '_Nonnull' cannot be applied to non-pointer type 'NotPtr'}} }; typedef AddNonNull::type nonnull_int_ptr_1; @@ -40,33 +35,6 @@ typedef AddNonNull::type nonnull_int_ptr_3; // expected-note{{in inst typedef AddNonNull::type nonnull_non_pointer_1; // expected-note{{in instantiation of template class 'AddNonNull' requested here}} -// Nullability on C++ class types (smart pointers). -struct NotPtr{}; -typedef AddNonNull::type nonnull_non_pointer_2; // expected-note{{in instantiation}} -struct _Nullable SmartPtr{ - SmartPtr(); - SmartPtr(nullptr_t); - SmartPtr(const SmartPtr&); - SmartPtr(SmartPtr&&); - SmartPtr &operator=(const SmartPtr&); - SmartPtr &operator=(SmartPtr&&); -}; -typedef AddNonNull::type nonnull_smart_pointer_1; -template struct _Nullable SmartPtrTemplate{}; -typedef AddNonNull>::type nonnull_smart_pointer_2; -namespace std { inline namespace __1 { - template class unique_ptr {}; - template class function; - template class function {}; -} } -typedef AddNonNull>::type nonnull_smart_pointer_3; -typedef AddNonNull>::type nonnull_smart_pointer_4; - -class Derived : public SmartPtr {}; -Derived _Nullable x; // expected-error {{'_Nullable' cannot be applied}} -class DerivedPrivate : private SmartPtr {}; -DerivedPrivate _Nullable y; // expected-error {{'_Nullable' cannot be applied}} - // Non-null checking within a template. 
template struct AddNonNull2 { @@ -86,7 +54,6 @@ void (*& accepts_nonnull_2)(_Nonnull int *ptr) = accepts_nonnull_1; void (X::* accepts_nonnull_3)(_Nonnull int *ptr); void accepts_nonnull_4(_Nonnull int *ptr); void (&accepts_nonnull_5)(_Nonnull int *ptr) = accepts_nonnull_4; -void accepts_nonnull_6(SmartPtr _Nonnull); void test_accepts_nonnull_null_pointer_literal(X *x) { accepts_nonnull_1(0); // expected-warning{{null passed to a callee that requires a non-null argument}} @@ -94,8 +61,6 @@ void test_accepts_nonnull_null_pointer_literal(X *x) { (x->*accepts_nonnull_3)(0); // expected-warning{{null passed to a callee that requires a non-null argument}} accepts_nonnull_4(0); // expected-warning{{null passed to a callee that requires a non-null argument}} accepts_nonnull_5(0); // expected-warning{{null passed to a callee that requires a non-null argument}} - - accepts_nonnull_6(nullptr); // expected-warning{{null passed to a callee that requires a non-null argument}} } template @@ -106,7 +71,6 @@ void test_accepts_nonnull_null_pointer_literal_template() { template void test_accepts_nonnull_null_pointer_literal_template<&accepts_nonnull_4>(); // expected-note{{instantiation of function template specialization}} void TakeNonnull(void *_Nonnull); -void TakeSmartNonnull(SmartPtr _Nonnull); // Check different forms of assignment to a nonull type from a nullable one. void AssignAndInitNonNull() { void *_Nullable nullable; @@ -117,26 +81,12 @@ void AssignAndInitNonNull() { void *_Nonnull nonnull; nonnull = nullable; // expected-warning{{implicit conversion from nullable pointer 'void * _Nullable' to non-nullable pointer type 'void * _Nonnull'}} nonnull = {nullable}; // expected-warning{{implicit conversion from nullable pointer 'void * _Nullable' to non-nullable pointer type 'void * _Nonnull'}} + TakeNonnull(nullable); //expected-warning{{implicit conversion from nullable pointer 'void * _Nullable' to non-nullable pointer type 'void * _Nonnull}} TakeNonnull(nonnull); // OK - nonnull = (void *_Nonnull)nullable; // explicit cast OK - - SmartPtr _Nullable s_nullable; - SmartPtr _Nonnull s(s_nullable); // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} - SmartPtr _Nonnull s2{s_nullable}; // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} - SmartPtr _Nonnull s3 = {s_nullable}; // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} - SmartPtr _Nonnull s4 = s_nullable; // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} - SmartPtr _Nonnull s_nonnull; - s_nonnull = s_nullable; // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} - s_nonnull = {s_nullable}; // no warning here - might be nice? 
- TakeSmartNonnull(s_nullable); //expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull}} - TakeSmartNonnull(s_nonnull); // OK - s_nonnull = (SmartPtr _Nonnull)s_nullable; // explicit cast OK - s_nonnull = static_cast(s_nullable); // explicit cast OK } void *_Nullable ReturnNullable(); -SmartPtr _Nullable ReturnSmartNullable(); void AssignAndInitNonNullFromFn() { void *_Nonnull p(ReturnNullable()); // expected-warning{{implicit conversion from nullable pointer 'void * _Nullable' to non-nullable pointer type 'void * _Nonnull'}} @@ -146,16 +96,8 @@ void AssignAndInitNonNullFromFn() { void *_Nonnull nonnull; nonnull = ReturnNullable(); // expected-warning{{implicit conversion from nullable pointer 'void * _Nullable' to non-nullable pointer type 'void * _Nonnull'}} nonnull = {ReturnNullable()}; // expected-warning{{implicit conversion from nullable pointer 'void * _Nullable' to non-nullable pointer type 'void * _Nonnull'}} - TakeNonnull(ReturnNullable()); //expected-warning{{implicit conversion from nullable pointer 'void * _Nullable' to non-nullable pointer type 'void * _Nonnull}} - SmartPtr _Nonnull s(ReturnSmartNullable()); // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} - SmartPtr _Nonnull s2{ReturnSmartNullable()}; // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} - SmartPtr _Nonnull s3 = {ReturnSmartNullable()}; // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} - SmartPtr _Nonnull s4 = ReturnSmartNullable(); // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} - SmartPtr _Nonnull s_nonnull; - s_nonnull = ReturnSmartNullable(); // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} - s_nonnull = {ReturnSmartNullable()}; - TakeSmartNonnull(ReturnSmartNullable()); // expected-warning{{implicit conversion from nullable pointer 'SmartPtr _Nullable' to non-nullable pointer type 'SmartPtr _Nonnull'}} + TakeNonnull(ReturnNullable()); //expected-warning{{implicit conversion from nullable pointer 'void * _Nullable' to non-nullable pointer type 'void * _Nonnull}} } void ConditionalExpr(bool c) { diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl index c56986b7f8622..98c02c38675f4 100644 --- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors.hlsl @@ -8,6 +8,7 @@ // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_log10 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sin // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_sqrt +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify 
-DTEST_FUNC=__builtin_elementwise_roundeven // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_trunc double2 test_double_builtin(double2 p0) { diff --git a/clang/test/SemaObjCXX/nullability-consistency.mm b/clang/test/SemaObjCXX/nullability-consistency.mm index f8b6a4c082d40..6921d8b9d3dd5 100644 --- a/clang/test/SemaObjCXX/nullability-consistency.mm +++ b/clang/test/SemaObjCXX/nullability-consistency.mm @@ -1,8 +1,6 @@ // RUN: %clang_cc1 -fsyntax-only -fblocks -I %S/Inputs -isystem %S/Inputs/nullability-consistency-system %s -verify // RUN: %clang_cc1 -fsyntax-only -fblocks -I %S/Inputs -isystem %S/Inputs/nullability-consistency-system %s -Wsystem-headers -DWARN_IN_SYSTEM_HEADERS -verify -// XFAIL: * - #include "nullability-consistency-1.h" #include "nullability-consistency-3.h" #include "nullability-consistency-4.h" @@ -11,7 +9,6 @@ #include "nullability-consistency-6.h" #include "nullability-consistency-7.h" #include "nullability-consistency-8.h" -#include "nullability-consistency-smart.h" #include "nullability-consistency-system.h" void h1(int *ptr) { } // don't warn diff --git a/clang/test/SemaTemplate/concepts-friends.cpp b/clang/test/SemaTemplate/concepts-friends.cpp index 255b0858917fb..91b797034ed6c 100644 --- a/clang/test/SemaTemplate/concepts-friends.cpp +++ b/clang/test/SemaTemplate/concepts-friends.cpp @@ -478,3 +478,29 @@ template class Foo { }; } // namespace FriendOfFriend + +namespace GH86769 { + +template +concept X = true; + +template struct Y { + Y(T) {} + template friend struct Y; + template friend struct Y; + template friend struct Y; +}; + +template +struct Z { + // FIXME: This is ill-formed per C++11 N3337 [temp.param]p12: + // A default template argument shall not be specified in a friend class + // template declaration. + template friend struct Y; +}; + +template struct Y; +template struct Z; +Y y(1); + +} diff --git a/clang/test/SemaTemplate/ctad.cpp b/clang/test/SemaTemplate/ctad.cpp index 388ed7d4cced1..ec144d4f44ba8 100644 --- a/clang/test/SemaTemplate/ctad.cpp +++ b/clang/test/SemaTemplate/ctad.cpp @@ -53,4 +53,4 @@ X x; template struct Y { Y(T); }; template struct Y ; Y y(1); -}; +} diff --git a/clang/tools/clang-installapi/InstallAPIOpts.td b/clang/tools/clang-installapi/InstallAPIOpts.td index 71532c9cf24d1..010f2507a1d1f 100644 --- a/clang/tools/clang-installapi/InstallAPIOpts.td +++ b/clang/tools/clang-installapi/InstallAPIOpts.td @@ -29,6 +29,8 @@ def verify_mode_EQ : Joined<["--"], "verify-mode=">, HelpText<"Specify the severity and extend of the validation. Valid modes are ErrorsOnly, ErrorsAndWarnings, and Pedantic.">; def demangle : Flag<["--", "-"], "demangle">, HelpText<"Demangle symbols when printing warnings and errors">; +def dsym: Joined<["--"], "dsym=">, + MetaVarName<"">, HelpText<"Specify dSYM path for enriched diagnostics.">; // Additional input options. 
def extra_project_header : Separate<["-"], "extra-project-header">, diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp index 8e4a1b019fd81..c4f39b7c84174 100644 --- a/clang/tools/clang-installapi/Options.cpp +++ b/clang/tools/clang-installapi/Options.cpp @@ -241,6 +241,9 @@ Options::processAndFilterOutInstallAPIOptions(ArrayRef Args) { if (const Arg *A = ParsedArgs.getLastArg(OPT_verify_against)) DriverOpts.DylibToVerify = A->getValue(); + if (const Arg *A = ParsedArgs.getLastArg(OPT_dsym)) + DriverOpts.DSYMPath = A->getValue(); + // Handle exclude & extra header directories or files. auto handleAdditionalInputArgs = [&](PathSeq &Headers, clang::installapi::ID OptID) { @@ -522,7 +525,8 @@ InstallAPIContext Options::createContext() { } Ctx.Verifier = std::make_unique( - std::move(*Slices), Diags, DriverOpts.VerifyMode, DriverOpts.Demangle); + std::move(*Slices), Diags, DriverOpts.VerifyMode, DriverOpts.Demangle, + DriverOpts.DSYMPath); return Ctx; } diff --git a/clang/tools/clang-installapi/Options.h b/clang/tools/clang-installapi/Options.h index 3671e4c8274bd..82e04b49d1259 100644 --- a/clang/tools/clang-installapi/Options.h +++ b/clang/tools/clang-installapi/Options.h @@ -67,6 +67,9 @@ struct DriverOptions { /// \brief Output path. std::string OutputPath; + /// \brief DSYM path. + std::string DSYMPath; + /// \brief File encoding to print. FileType OutFT = FileType::TBD_V5; diff --git a/clang/unittests/AST/DeclPrinterTest.cpp b/clang/unittests/AST/DeclPrinterTest.cpp index 8a29d0544a04b..f2b027a25621c 100644 --- a/clang/unittests/AST/DeclPrinterTest.cpp +++ b/clang/unittests/AST/DeclPrinterTest.cpp @@ -1387,34 +1387,38 @@ TEST(DeclPrinter, TestTemplateArgumentList16) { } TEST(DeclPrinter, TestCXXRecordDecl17) { - ASSERT_TRUE(PrintedDeclCXX98Matches("template struct Z {};" - "struct X {};" - "Z A;", - "A", "Z A")); - (void)[](PrintingPolicy &Policy) { Policy.SuppressTagKeyword = false; }; + ASSERT_TRUE(PrintedDeclCXX98Matches( + "template struct Z {};" + "struct X {};" + "Z A;", + "A", "Z A", + [](PrintingPolicy &Policy) { Policy.SuppressTagKeyword = false; })); } TEST(DeclPrinter, TestCXXRecordDecl18) { - ASSERT_TRUE(PrintedDeclCXX98Matches("template struct Z {};" - "struct X {};" - "Z A;" - "template " - "struct Y{};" - "Y, 2> B;", - "B", "Y, 2> B")); - (void)[](PrintingPolicy &Policy) { Policy.SuppressTagKeyword = false; }; + ASSERT_TRUE(PrintedDeclCXX98Matches( + "template struct Z {};" + "struct X {};" + "Z A;" + "template " + "struct Y{};" + "Y, 2> B;", + "B", "Y, 2> B", + [](PrintingPolicy &Policy) { Policy.SuppressTagKeyword = false; })); } TEST(DeclPrinter, TestCXXRecordDecl19) { - ASSERT_TRUE(PrintedDeclCXX98Matches("template struct Z {};" - "struct X {};" - "Z A;" - "template " - "struct Y{};" - "Y, 2> B;", - "B", "Y, 2> B")); - (void)[](PrintingPolicy &Policy) { Policy.SuppressTagKeyword = true; }; + ASSERT_TRUE(PrintedDeclCXX98Matches( + "template struct Z {};" + "struct X {};" + "Z A;" + "template " + "struct Y{};" + "Y, 2> B;", + "B", "Y, 2> B", + [](PrintingPolicy &Policy) { Policy.SuppressTagKeyword = true; })); } + TEST(DeclPrinter, TestCXXRecordDecl20) { ASSERT_TRUE(PrintedDeclCXX98Matches( "template class Inner;" @@ -1431,8 +1435,8 @@ TEST(DeclPrinter, TestCXXRecordDecl20) { "};" "Outer, 5>::NestedStruct nestedInstance(100);", "nestedInstance", - "Outer, 5>::NestedStruct nestedInstance(100)")); - (void)[](PrintingPolicy &Policy) { Policy.SuppressTagKeyword = false; }; + "Outer, 5>::NestedStruct nestedInstance(100)", + 
[](PrintingPolicy &Policy) { Policy.SuppressTagKeyword = false; })); } TEST(DeclPrinter, TestCXXRecordDecl21) { @@ -1451,8 +1455,8 @@ TEST(DeclPrinter, TestCXXRecordDecl21) { "};" "Outer, 5>::NestedStruct nestedInstance(100);", "nestedInstance", - "Outer, 5>::NestedStruct nestedInstance(100)")); - (void)[](PrintingPolicy &Policy) { Policy.SuppressTagKeyword = true; }; + "Outer, 5>::NestedStruct nestedInstance(100)", + [](PrintingPolicy &Policy) { Policy.SuppressTagKeyword = true; })); } TEST(DeclPrinter, TestFunctionParamUglified) { diff --git a/clang/unittests/AST/DeclTest.cpp b/clang/unittests/AST/DeclTest.cpp index cef0f8711416b..2530ce74eb6a3 100644 --- a/clang/unittests/AST/DeclTest.cpp +++ b/clang/unittests/AST/DeclTest.cpp @@ -429,7 +429,7 @@ TEST(Decl, ImplicitlyDeclaredAllocationFunctionsInModules) { .bind("operator new"), Ctx)); ASSERT_TRUE(SizedOperatorNew->getOwningModule()); - EXPECT_TRUE(SizedOperatorNew->getOwningModule()->isGlobalModule()); + EXPECT_TRUE(SizedOperatorNew->isFromExplicitGlobalModule()); // void* operator new(std::size_t, std::align_val_t); auto *SizedAlignedOperatorNew = selectFirst( @@ -441,7 +441,7 @@ TEST(Decl, ImplicitlyDeclaredAllocationFunctionsInModules) { .bind("operator new"), Ctx)); ASSERT_TRUE(SizedAlignedOperatorNew->getOwningModule()); - EXPECT_TRUE(SizedAlignedOperatorNew->getOwningModule()->isGlobalModule()); + EXPECT_TRUE(SizedAlignedOperatorNew->isFromExplicitGlobalModule()); // void* operator new[](std::size_t); auto *SizedArrayOperatorNew = selectFirst( @@ -451,7 +451,7 @@ TEST(Decl, ImplicitlyDeclaredAllocationFunctionsInModules) { .bind("operator new[]"), Ctx)); ASSERT_TRUE(SizedArrayOperatorNew->getOwningModule()); - EXPECT_TRUE(SizedArrayOperatorNew->getOwningModule()->isGlobalModule()); + EXPECT_TRUE(SizedArrayOperatorNew->isFromExplicitGlobalModule()); // void* operator new[](std::size_t, std::align_val_t); auto *SizedAlignedArrayOperatorNew = selectFirst( @@ -464,7 +464,7 @@ TEST(Decl, ImplicitlyDeclaredAllocationFunctionsInModules) { Ctx)); ASSERT_TRUE(SizedAlignedArrayOperatorNew->getOwningModule()); EXPECT_TRUE( - SizedAlignedArrayOperatorNew->getOwningModule()->isGlobalModule()); + SizedAlignedArrayOperatorNew->isFromExplicitGlobalModule()); // void operator delete(void*) noexcept; auto *Delete = selectFirst( @@ -475,7 +475,7 @@ TEST(Decl, ImplicitlyDeclaredAllocationFunctionsInModules) { .bind("operator delete"), Ctx)); ASSERT_TRUE(Delete->getOwningModule()); - EXPECT_TRUE(Delete->getOwningModule()->isGlobalModule()); + EXPECT_TRUE(Delete->isFromExplicitGlobalModule()); // void operator delete(void*, std::align_val_t) noexcept; auto *AlignedDelete = selectFirst( @@ -487,7 +487,7 @@ TEST(Decl, ImplicitlyDeclaredAllocationFunctionsInModules) { .bind("operator delete"), Ctx)); ASSERT_TRUE(AlignedDelete->getOwningModule()); - EXPECT_TRUE(AlignedDelete->getOwningModule()->isGlobalModule()); + EXPECT_TRUE(AlignedDelete->isFromExplicitGlobalModule()); // Sized deallocation is not enabled by default. So we skip it here. 
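The repeated edit running through this test collapses a two-step query into one Decl helper. Roughly, against the clang AST API (a sketch; the function names oldCheck/newCheck are hypothetical):

```cpp
#include "clang/AST/DeclBase.h"
#include "clang/Basic/Module.h"

// Old spelling: null-check the owning module, then test for a global module.
bool oldCheck(const clang::Decl *D) {
  if (const clang::Module *M = D->getOwningModule())
    return M->isGlobalModule();
  return false;
}

// New spelling: one call, which also pins down the *explicit* global module
// fragment, matching what the updated expectations assert for these
// implicitly declared allocation functions.
bool newCheck(const clang::Decl *D) {
  return D->isFromExplicitGlobalModule();
}
```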
@@ -500,7 +500,7 @@ TEST(Decl, ImplicitlyDeclaredAllocationFunctionsInModules) { .bind("operator delete[]"), Ctx)); ASSERT_TRUE(ArrayDelete->getOwningModule()); - EXPECT_TRUE(ArrayDelete->getOwningModule()->isGlobalModule()); + EXPECT_TRUE(ArrayDelete->isFromExplicitGlobalModule()); // void operator delete[](void*, std::align_val_t) noexcept; auto *AlignedArrayDelete = selectFirst( @@ -512,7 +512,7 @@ TEST(Decl, ImplicitlyDeclaredAllocationFunctionsInModules) { .bind("operator delete[]"), Ctx)); ASSERT_TRUE(AlignedArrayDelete->getOwningModule()); - EXPECT_TRUE(AlignedArrayDelete->getOwningModule()->isGlobalModule()); + EXPECT_TRUE(AlignedArrayDelete->isFromExplicitGlobalModule()); } TEST(Decl, TemplateArgumentDefaulted) { diff --git a/clang/unittests/Lex/PPDependencyDirectivesTest.cpp b/clang/unittests/Lex/PPDependencyDirectivesTest.cpp index 6ff87f720a559..0c396720ece66 100644 --- a/clang/unittests/Lex/PPDependencyDirectivesTest.cpp +++ b/clang/unittests/Lex/PPDependencyDirectivesTest.cpp @@ -117,11 +117,6 @@ TEST_F(PPDependencyDirectivesTest, MacroGuard) { }; auto PPOpts = std::make_shared(); - PPOpts->DependencyDirectivesForFile = [&](FileEntryRef File) - -> std::optional> { - return getDependencyDirectives(File); - }; - TrivialModuleLoader ModLoader; HeaderSearch HeaderInfo(std::make_shared(), SourceMgr, Diags, LangOpts, Target.get()); @@ -130,6 +125,12 @@ TEST_F(PPDependencyDirectivesTest, MacroGuard) { /*OwnsHeaderSearch =*/false); PP.Initialize(*Target); + PP.setDependencyDirectivesFn( + [&](FileEntryRef File) + -> std::optional> { + return getDependencyDirectives(File); + }); + SmallVector IncludedFiles; PP.addPPCallbacks(std::make_unique(PP, IncludedFiles)); PP.EnterMainSourceFile(); diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index f4dd90aac6655..e7bc90cd0960e 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -18,6 +18,7 @@ #include "local_cache.h" #include "mem_map.h" #include "memtag.h" +#include "mutex.h" #include "options.h" #include "quarantine.h" #include "report.h" @@ -178,17 +179,17 @@ class Allocator { Quarantine.init( static_cast(getFlags()->quarantine_size_kb << 10), static_cast(getFlags()->thread_local_quarantine_size_kb << 10)); - - mapAndInitializeRingBuffer(); } - void enableRingBuffer() { + void enableRingBuffer() NO_THREAD_SAFETY_ANALYSIS { AllocationRingBuffer *RB = getRingBuffer(); if (RB) RB->Depot->enable(); + RingBufferInitLock.unlock(); } - void disableRingBuffer() { + void disableRingBuffer() NO_THREAD_SAFETY_ANALYSIS { + RingBufferInitLock.lock(); AllocationRingBuffer *RB = getRingBuffer(); if (RB) RB->Depot->disable(); @@ -915,9 +916,11 @@ class Allocator { DCHECK(!Primary.Options.load().get(OptionBit::TrackAllocationStacks)); return; } - if (Track) + + if (Track) { + initRingBufferMaybe(); Primary.Options.set(OptionBit::TrackAllocationStacks); - else + } else Primary.Options.clear(OptionBit::TrackAllocationStacks); } @@ -1092,6 +1095,9 @@ class Allocator { 0, "invalid alignment"); + // Lock to initialize the RingBuffer + HybridMutex RingBufferInitLock; + // Pointer to memory mapped area starting with AllocationRingBuffer struct, // and immediately followed by Size elements of type Entry. 
atomic_uptr RingBufferAddress = {}; @@ -1546,11 +1552,16 @@ class Allocator { RBEntryStart)[N]; } - void mapAndInitializeRingBuffer() { - if (getFlags()->allocation_ring_buffer_size <= 0) + void initRingBufferMaybe() { + ScopedLock L(RingBufferInitLock); + if (getRingBuffer() != nullptr) return; - u32 AllocationRingBufferSize = - static_cast(getFlags()->allocation_ring_buffer_size); + + int ring_buffer_size = getFlags()->allocation_ring_buffer_size; + if (ring_buffer_size <= 0) + return; + + u32 AllocationRingBufferSize = static_cast(ring_buffer_size); // We store alloc and free stacks for each entry. constexpr u32 kStacksPerRingBufferEntry = 2; diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp index 6a311adc55e4b..1a36155bcd423 100644 --- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp @@ -867,32 +867,86 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, ReallocateInPlaceStress) { } } +SCUDO_TYPED_TEST(ScudoCombinedTest, RingBufferDefaultDisabled) { + // The RingBuffer is not initialized until tracking is enabled for the + // first time. + auto *Allocator = this->Allocator.get(); + EXPECT_EQ(0u, Allocator->getRingBufferSize()); + EXPECT_EQ(nullptr, Allocator->getRingBufferAddress()); +} + +SCUDO_TYPED_TEST(ScudoCombinedTest, RingBufferInitOnce) { + auto *Allocator = this->Allocator.get(); + Allocator->setTrackAllocationStacks(true); + + auto RingBufferSize = Allocator->getRingBufferSize(); + ASSERT_GT(RingBufferSize, 0u); + auto *RingBufferAddress = Allocator->getRingBufferAddress(); + EXPECT_NE(nullptr, RingBufferAddress); + + // Enable tracking again to verify that the initialization only happens once. + Allocator->setTrackAllocationStacks(true); + ASSERT_EQ(RingBufferSize, Allocator->getRingBufferSize()); + EXPECT_EQ(RingBufferAddress, Allocator->getRingBufferAddress()); +} + SCUDO_TYPED_TEST(ScudoCombinedTest, RingBufferSize) { auto *Allocator = this->Allocator.get(); - auto Size = Allocator->getRingBufferSize(); - ASSERT_GT(Size, 0u); - EXPECT_EQ(Allocator->getRingBufferAddress()[Size - 1], '\0'); + Allocator->setTrackAllocationStacks(true); + + auto RingBufferSize = Allocator->getRingBufferSize(); + ASSERT_GT(RingBufferSize, 0u); + EXPECT_EQ(Allocator->getRingBufferAddress()[RingBufferSize - 1], '\0'); } SCUDO_TYPED_TEST(ScudoCombinedTest, RingBufferAddress) { auto *Allocator = this->Allocator.get(); - auto *Addr = Allocator->getRingBufferAddress(); - EXPECT_NE(Addr, nullptr); - EXPECT_EQ(Addr, Allocator->getRingBufferAddress()); + Allocator->setTrackAllocationStacks(true); + + auto *RingBufferAddress = Allocator->getRingBufferAddress(); + EXPECT_NE(RingBufferAddress, nullptr); + EXPECT_EQ(RingBufferAddress, Allocator->getRingBufferAddress()); +} + +SCUDO_TYPED_TEST(ScudoCombinedTest, StackDepotDefaultDisabled) { + // The StackDepot is not initialized until tracking is enabled for the + // first time. 
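+  // (Aside, not in the original test: the depot is mapped together with the
+  // ring buffer, so both stay empty until the first
+  // setTrackAllocationStacks(true) triggers initRingBufferMaybe().)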
+ auto *Allocator = this->Allocator.get(); + EXPECT_EQ(0u, Allocator->getStackDepotSize()); + EXPECT_EQ(nullptr, Allocator->getStackDepotAddress()); +} + +SCUDO_TYPED_TEST(ScudoCombinedTest, StackDepotInitOnce) { + auto *Allocator = this->Allocator.get(); + Allocator->setTrackAllocationStacks(true); + + auto StackDepotSize = Allocator->getStackDepotSize(); + EXPECT_GT(StackDepotSize, 0u); + auto *StackDepotAddress = Allocator->getStackDepotAddress(); + EXPECT_NE(nullptr, StackDepotAddress); + + // Enable tracking again to verify that the initialization only happens once. + Allocator->setTrackAllocationStacks(true); + EXPECT_EQ(StackDepotSize, Allocator->getStackDepotSize()); + EXPECT_EQ(StackDepotAddress, Allocator->getStackDepotAddress()); } SCUDO_TYPED_TEST(ScudoCombinedTest, StackDepotSize) { auto *Allocator = this->Allocator.get(); - auto Size = Allocator->getStackDepotSize(); - ASSERT_GT(Size, 0u); - EXPECT_EQ(Allocator->getStackDepotAddress()[Size - 1], '\0'); + Allocator->setTrackAllocationStacks(true); + + auto StackDepotSize = Allocator->getStackDepotSize(); + EXPECT_GT(StackDepotSize, 0u); + EXPECT_EQ(Allocator->getStackDepotAddress()[StackDepotSize - 1], '\0'); } SCUDO_TYPED_TEST(ScudoCombinedTest, StackDepotAddress) { auto *Allocator = this->Allocator.get(); - auto *Addr = Allocator->getStackDepotAddress(); - EXPECT_NE(Addr, nullptr); - EXPECT_EQ(Addr, Allocator->getStackDepotAddress()); + Allocator->setTrackAllocationStacks(true); + + auto *StackDepotAddress = Allocator->getStackDepotAddress(); + EXPECT_NE(StackDepotAddress, nullptr); + EXPECT_EQ(StackDepotAddress, Allocator->getStackDepotAddress()); } SCUDO_TYPED_TEST(ScudoCombinedTest, StackDepot) { diff --git a/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp b/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp index 17a596d712d0c..abb81803f65ee 100644 --- a/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp @@ -129,6 +129,7 @@ TEST(ScudoStringsTest, Padding) { } #if defined(__linux__) + #include TEST(ScudoStringsTest, CapacityIncreaseFails) { @@ -136,9 +137,19 @@ TEST(ScudoStringsTest, CapacityIncreaseFails) { rlimit Limit = {}; EXPECT_EQ(0, getrlimit(RLIMIT_AS, &Limit)); + rlimit EmptyLimit = {.rlim_cur = 0, .rlim_max = Limit.rlim_max}; EXPECT_EQ(0, setrlimit(RLIMIT_AS, &EmptyLimit)); + // qemu does not honor the setrlimit, so verify before proceeding. + scudo::MemMapT MemMap; + if (MemMap.map(/*Addr=*/0U, scudo::getPageSizeCached(), "scudo:test", + MAP_ALLOWNOMEM)) { + MemMap.unmap(MemMap.getBase(), MemMap.getCapacity()); + setrlimit(RLIMIT_AS, &Limit); + GTEST_SKIP() << "Limiting address space does not prevent mmap."; + } + // Test requires that the default length is at least 6 characters. scudo::uptr MaxSize = Str.capacity(); EXPECT_LE(6u, MaxSize); diff --git a/compiler-rt/lib/scudo/standalone/tests/vector_test.cpp b/compiler-rt/lib/scudo/standalone/tests/vector_test.cpp index add62c5a42a3e..b612676b7bd79 100644 --- a/compiler-rt/lib/scudo/standalone/tests/vector_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/vector_test.cpp @@ -58,6 +58,15 @@ TEST(ScudoVectorTest, ReallocateFails) { rlimit EmptyLimit = {.rlim_cur = 0, .rlim_max = Limit.rlim_max}; EXPECT_EQ(0, setrlimit(RLIMIT_AS, &EmptyLimit)); + // qemu does not honor the setrlimit, so verify before proceeding. 
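+  // (Illustrative note, not in the original patch: MAP_ALLOWNOMEM makes a
+  // failed mapping report failure instead of terminating, so a *successful*
+  // probe map while RLIMIT_AS is 0 proves the limit is not enforced and the
+  // intended allocation failure cannot be provoked.)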
+  scudo::MemMapT MemMap;
+  if (MemMap.map(/*Addr=*/0U, scudo::getPageSizeCached(), "scudo:test",
+                 MAP_ALLOWNOMEM)) {
+    MemMap.unmap(MemMap.getBase(), MemMap.getCapacity());
+    setrlimit(RLIMIT_AS, &Limit);
+    GTEST_SKIP() << "Limiting address space does not prevent mmap.";
+  }
+
   V.resize(capacity);
   // Set the last element so we can check it later.
   V.back() = '\0';
diff --git a/flang/test/Transforms/stack-arrays.fir b/flang/test/Transforms/stack-arrays.fir
index f4fe737e88d78..a2ffe555091eb 100644
--- a/flang/test/Transforms/stack-arrays.fir
+++ b/flang/test/Transforms/stack-arrays.fir
@@ -127,9 +127,7 @@ func.func @placement1() {
   return
 }
 // CHECK: func.func @placement1() {
-// CHECK-NEXT: %[[ONE:.*]] = arith.constant 1 : index
-// CHECK-NEXT: %[[TWO:.*]] = arith.constant 2 : index
-// CHECK-NEXT: %[[ARG:.*]] = arith.addi %[[ONE]], %[[TWO]] : index
+// CHECK-NEXT: %[[ARG:.*]] = arith.constant 3 : index
 // CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array, %[[ARG]]
 // CHECK-NEXT: return
 // CHECK-NEXT: }
@@ -204,13 +202,12 @@ func.func @placement4(%arg0 : i1) {
 // CHECK: func.func @placement4(%arg0: i1) {
 // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index
 // CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32
-// CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index
 // CHECK-NEXT: %[[C10:.*]] = arith.constant 10 : index
 // CHECK-NEXT: cf.br ^bb1
 // CHECK-NEXT: ^bb1:
-// CHECK-NEXT: %[[SUM:.*]] = arith.addi %[[C1]], %[[C2]] : index
+// CHECK-NEXT: %[[C3:.*]] = arith.constant 3 : index
 // CHECK-NEXT: %[[SP:.*]] = fir.call @llvm.stacksave.p0() : () -> !fir.ref
-// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array, %[[SUM]]
+// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array, %[[C3]]
 // CHECK-NEXT: fir.call @llvm.stackrestore.p0(%[[SP]]) : (!fir.ref) -> ()
 // CHECK-NEXT: cf.cond_br %arg0, ^bb1, ^bb2
 // CHECK-NEXT: ^bb2:
diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index 5bc0898298ce3..40a1cfda060e6 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -60,6 +60,15 @@ function(_get_common_compile_options output_var flags)
   if (LIBC_CC_SUPPORTS_PATTERN_INIT)
     list(APPEND compile_options "-ftrivial-auto-var-init=pattern")
   endif()
+  if (LIBC_CONF_KEEP_FRAME_POINTER)
+    list(APPEND compile_options "-fno-omit-frame-pointer")
+    if (LIBC_TARGET_ARCHITECTURE_IS_X86)
+      list(APPEND compile_options "-mno-omit-leaf-frame-pointer")
+    endif()
+  endif()
+  if (LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR)
+    list(APPEND compile_options "-fstack-protector-strong")
+  endif()
   list(APPEND compile_options "-Wall")
   list(APPEND compile_options "-Wextra")
   # -DLIBC_WNO_ERROR=ON if you can't build cleanly with -Werror.
diff --git a/libc/config/config.json b/libc/config/config.json
index b73c47b1a14bc..d6ef891b9f260 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -30,5 +30,15 @@
       "value": false,
       "doc": "Inserts prefetch for write instructions (PREFETCHW) for memset on x86 to recover performance when hardware prefetcher is disabled."
     }
+  },
+  "codegen": {
+    "LIBC_CONF_KEEP_FRAME_POINTER": {
+      "value": true,
+      "doc": "Keep frame pointer in functions for better debugging experience."
+    },
+    "LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR": {
+      "value": true,
+      "doc": "Enable -fstack-protector-strong to defend against stack smashing attack."
+ } } } diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst index a177550647bd9..8f8c44caa1153 100644 --- a/libc/docs/configure.rst +++ b/libc/docs/configure.rst @@ -25,6 +25,9 @@ See the main ``config/config.json``, and the platform and architecture specific overrides in ``config//config.json`` and ``config///config.json,`` to learn about the defaults for your platform and target. +* **"codegen" options** + - ``LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR``: Enable -fstack-protector-strong to defend against stack smashing attack. + - ``LIBC_CONF_KEEP_FRAME_POINTER``: Keep frame pointer in functions for better debugging experience. * **"printf" options** - ``LIBC_CONF_PRINTF_DISABLE_FIXED_POINT``: Disable printing fixed point values in printf and friends. - ``LIBC_CONF_PRINTF_DISABLE_FLOAT``: Disable printing floating point values in printf and friends. diff --git a/libc/docs/dev/printf_behavior.rst b/libc/docs/dev/printf_behavior.rst index 9548bfda57aa7..c8b8ad45e987d 100644 --- a/libc/docs/dev/printf_behavior.rst +++ b/libc/docs/dev/printf_behavior.rst @@ -173,6 +173,10 @@ If a number passed as a min width or precision value is out of range for an int, then it will be treated as the largest or smallest value in the int range (e.g. "%-999999999999.999999999999s" is the same as "%-2147483648.2147483647s"). +If a number passed as a bit width is less than or equal to zero, the conversion +is considered invalid. If the provided bit width is larger than the width of +uintmax_t, it will be clamped to the width of uintmax_t. + ---------- Conversion ---------- diff --git a/libc/docs/gpu/rpc.rst b/libc/docs/gpu/rpc.rst index 9d6d8099db951..e13a377f305c0 100644 --- a/libc/docs/gpu/rpc.rst +++ b/libc/docs/gpu/rpc.rst @@ -251,14 +251,10 @@ but the following example shows how it can be used by a standard user. __global__ void hello() { puts("Hello world!"); } int main() { - int device = 0; - // Initialize the RPC server to run on a single device. - if (rpc_status_t err = rpc_init(/*num_device=*/1)) - handle_error(err); - // Initialize the RPC server to run on the given device. + rpc_device_t device; if (rpc_status_t err = - rpc_server_init(device, RPC_MAXIMUM_PORT_COUNT, + rpc_server_init(&device, RPC_MAXIMUM_PORT_COUNT, /*warp_size=*/32, alloc_host, /*data=*/nullptr)) handle_error(err); @@ -277,6 +273,7 @@ but the following example shows how it can be used by a standard user. hello<<<1, 1, 0, stream>>>(); // While the kernel is executing, check the RPC server for work to do. + // Requires non-blocking CUDA kernels but avoids a separate thread. while (cudaStreamQuery(stream) == cudaErrorNotReady) if (rpc_status_t err = rpc_handle_server(device)) handle_error(err); @@ -286,10 +283,6 @@ but the following example shows how it can be used by a standard user. rpc_server_shutdown(device, free_host, /*data=*/nullptr)) handle_error(err); - // Shut down the entire RPC server interface. - if (rpc_status_t err = rpc_shutdown()) - handle_error(err); - return EXIT_SUCCESS; } @@ -300,7 +293,7 @@ associated with relocatable device code linking. .. 
code-block:: sh - $> clang++ -x cuda rpc.cpp --offload-arch=native -fgpu-rdc -lcudart -lcgpu \ + $> clang++ -x cuda rpc.cpp --offload-arch=native -fgpu-rdc -lcudart -lcgpu-nvptx \ -Iinclude -L/lib -lllvmlibc_rpc_server \ -O3 -foffload-lto -o hello $> ./hello diff --git a/libc/src/__support/FPUtil/BasicOperations.h b/libc/src/__support/FPUtil/BasicOperations.h index a47931bb33900..6e4156497618e 100644 --- a/libc/src/__support/FPUtil/BasicOperations.h +++ b/libc/src/__support/FPUtil/BasicOperations.h @@ -185,27 +185,28 @@ LIBC_INLINE int canonicalize(T &cx, const T &x) { // More precisely : // Exponent | Significand | Meaning // | Bits 63-62 | Bits 61-0 | - // All Ones | 00 | Zero | Pseudo Infinity, Value = Infinty + // All Ones | 00 | Zero | Pseudo Infinity, Value = SNaN // All Ones | 00 | Non-Zero | Pseudo NaN, Value = SNaN // All Ones | 01 | Anything | Pseudo NaN, Value = SNaN // | Bit 63 | Bits 62-0 | // All zeroes | One | Anything | Pseudo Denormal, Value = // | | | (−1)**s × m × 2**−16382 - // All Other | Zero | Anything | Unnormal, Value = - // Values | | | (−1)**s × m × 2**−16382 + // All Other | Zero | Anything | Unnormal, Value = SNaN + // Values | | | bool bit63 = sx.get_implicit_bit(); UInt128 mantissa = sx.get_explicit_mantissa(); bool bit62 = static_cast((mantissa & (1ULL << 62)) >> 62); int exponent = sx.get_biased_exponent(); if (exponent == 0x7FFF) { if (!bit63 && !bit62) { - if (mantissa == 0) - cx = FPBits::inf(sx.sign()).get_val(); - else { + if (mantissa == 0) { cx = FPBits::quiet_nan(sx.sign(), mantissa).get_val(); raise_except_if_required(FE_INVALID); return 1; } + cx = FPBits::quiet_nan(sx.sign(), mantissa).get_val(); + raise_except_if_required(FE_INVALID); + return 1; } else if (!bit63 && bit62) { cx = FPBits::quiet_nan(sx.sign(), mantissa).get_val(); raise_except_if_required(FE_INVALID); @@ -219,9 +220,11 @@ LIBC_INLINE int canonicalize(T &cx, const T &x) { cx = x; } else if (exponent == 0 && bit63) cx = FPBits::make_value(mantissa, 0).get_val(); - else if (exponent != 0 && !bit63) - cx = FPBits::make_value(mantissa, 0).get_val(); - else if (LIBC_UNLIKELY(sx.is_signaling_nan())) { + else if (exponent != 0 && !bit63) { + cx = FPBits::quiet_nan(sx.sign(), mantissa).get_val(); + raise_except_if_required(FE_INVALID); + return 1; + } else if (LIBC_UNLIKELY(sx.is_signaling_nan())) { cx = FPBits::quiet_nan(sx.sign(), sx.get_explicit_mantissa()).get_val(); raise_except_if_required(FE_INVALID); diff --git a/libc/src/stdio/printf_core/converter_utils.h b/libc/src/stdio/printf_core/converter_utils.h index 948fe816e9b76..a0e96a11be5bf 100644 --- a/libc/src/stdio/printf_core/converter_utils.h +++ b/libc/src/stdio/printf_core/converter_utils.h @@ -18,7 +18,9 @@ namespace LIBC_NAMESPACE { namespace printf_core { -LIBC_INLINE uintmax_t apply_length_modifier(uintmax_t num, LengthModifier lm) { +LIBC_INLINE uintmax_t apply_length_modifier(uintmax_t num, + LengthSpec length_spec) { + auto [lm, bw] = length_spec; switch (lm) { case LengthModifier::none: return num & cpp::numeric_limits::max(); @@ -40,6 +42,18 @@ LIBC_INLINE uintmax_t apply_length_modifier(uintmax_t num, LengthModifier lm) { return num & cpp::numeric_limits::max(); case LengthModifier::j: return num; // j is intmax, so no mask is necessary. 
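The `w`/`wf` cases added in the hunk below reduce the converted value to the
requested bit width, matching the clamping rules documented in
printf_behavior.rst above. A minimal standalone sketch of that masking, with
an illustrative name (`mask_to_bit_width` is not part of the libc sources):

    #include <climits>
    #include <cstddef>
    #include <cstdint>

    // Illustrative sketch of the %wN masking: keep the low `bw` bits,
    // clamping widths at or beyond uintmax_t to the full value.
    inline uintmax_t mask_to_bit_width(uintmax_t num, size_t bw) {
      if (bw == 0)
        return 0; // a zero bit width is rejected as an invalid conversion
      if (bw < sizeof(uintmax_t) * CHAR_BIT)
        return num & ((static_cast<uintmax_t>(1) << bw) - 1);
      return num; // bw >= width of uintmax_t: nothing to mask off
    }

For example, mask_to_bit_width(5807, 3) == 7, which is why the "%w3d"
sprintf test further down prints "7".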
+ case LengthModifier::w: + case LengthModifier::wf: { + uintmax_t mask; + if (bw == 0) { + mask = 0; + } else if (bw < sizeof(uintmax_t) * CHAR_BIT) { + mask = (static_cast(1) << bw) - 1; + } else { + mask = UINTMAX_MAX; + } + return num & mask; + } } __builtin_unreachable(); } diff --git a/libc/src/stdio/printf_core/core_structs.h b/libc/src/stdio/printf_core/core_structs.h index d3718b49d1b13..1e78f195a75e8 100644 --- a/libc/src/stdio/printf_core/core_structs.h +++ b/libc/src/stdio/printf_core/core_structs.h @@ -22,7 +22,12 @@ namespace printf_core { // These length modifiers match the length modifiers in the format string, which // is why they are formatted differently from the rest of the file. -enum class LengthModifier { hh, h, l, ll, j, z, t, L, none }; +enum class LengthModifier { hh, h, l, ll, j, z, t, L, w, wf, none }; + +struct LengthSpec { + LengthModifier lm; + size_t bit_width; +}; enum FormatFlags : uint8_t { LEFT_JUSTIFIED = 0x01, // - @@ -44,6 +49,7 @@ struct FormatSection { // Format Specifier Values FormatFlags flags = FormatFlags(0); LengthModifier length_modifier = LengthModifier::none; + size_t bit_width = 0; int min_width = 0; int precision = -1; @@ -66,6 +72,7 @@ struct FormatSection { if (!((static_cast(flags) == static_cast(other.flags)) && (min_width == other.min_width) && (precision == other.precision) && + (bit_width == other.bit_width) && (length_modifier == other.length_modifier) && (conv_name == other.conv_name))) return false; diff --git a/libc/src/stdio/printf_core/int_converter.h b/libc/src/stdio/printf_core/int_converter.h index 2efbf53d40938..496e7bd1a56d9 100644 --- a/libc/src/stdio/printf_core/int_converter.h +++ b/libc/src/stdio/printf_core/int_converter.h @@ -71,7 +71,6 @@ LIBC_INLINE int convert_int(Writer *writer, const FormatSection &to_conv) { uintmax_t num = static_cast(to_conv.conv_val_raw); bool is_negative = false; FormatFlags flags = to_conv.flags; - const char a = is_lower(to_conv.conv_name) ? 'a' : 'A'; // If the conversion is signed, then handle negative values. @@ -89,8 +88,8 @@ LIBC_INLINE int convert_int(Writer *writer, const FormatSection &to_conv) { ~(FormatFlags::FORCE_SIGN | FormatFlags::SPACE_PREFIX)); } - num = apply_length_modifier(num, to_conv.length_modifier); - + num = + apply_length_modifier(num, {to_conv.length_modifier, to_conv.bit_width}); cpp::array buf; auto str = details::num_to_strview(num, buf, to_conv.conv_name); if (!str) diff --git a/libc/src/stdio/printf_core/parser.h b/libc/src/stdio/printf_core/parser.h index 0876116a0bac8..8e8c77e219fa4 100644 --- a/libc/src/stdio/printf_core/parser.h +++ b/libc/src/stdio/printf_core/parser.h @@ -150,10 +150,10 @@ template class Parser { } } - LengthModifier lm = parse_length_modifier(&cur_pos); - + auto [lm, bw] = parse_length_modifier(&cur_pos); section.length_modifier = lm; section.conv_name = str[cur_pos]; + section.bit_width = bw; switch (str[cur_pos]) { case ('%'): // Regardless of options, a % conversion is always safe. 
The standard @@ -202,6 +202,21 @@ template class Parser { WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, ptrdiff_t, conv_index); break; + + case (LengthModifier::w): + case (LengthModifier::wf): + if (bw == 0) { + section.has_conv = false; + } else if (bw <= INT_WIDTH) { + WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, int, conv_index); + } else if (bw <= LONG_WIDTH) { + WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, long, conv_index); + } else if (bw <= LLONG_WIDTH) { + WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, long long, conv_index); + } else { + WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, intmax_t, conv_index); + } + break; } break; #ifndef LIBC_COPT_PRINTF_DISABLE_FLOAT @@ -306,38 +321,54 @@ template class Parser { // assumes that str[*local_pos] is inside a format specifier. It returns a // LengthModifier with the length modifier it found. It will advance local_pos // after the format specifier if one is found. - LIBC_INLINE LengthModifier parse_length_modifier(size_t *local_pos) { + LIBC_INLINE LengthSpec parse_length_modifier(size_t *local_pos) { switch (str[*local_pos]) { case ('l'): if (str[*local_pos + 1] == 'l') { *local_pos += 2; - return LengthModifier::ll; + return {LengthModifier::ll, 0}; + } else { + ++*local_pos; + return {LengthModifier::l, 0}; + } + case ('w'): { + LengthModifier lm; + if (str[*local_pos + 1] == 'f') { + *local_pos += 2; + lm = LengthModifier::wf; } else { ++*local_pos; - return LengthModifier::l; + lm = LengthModifier::w; } + if (internal::isdigit(str[*local_pos])) { + const auto result = internal::strtointeger(str + *local_pos, 10); + *local_pos += result.parsed_len; + return {lm, static_cast(cpp::max(0, result.value))}; + } + return {lm, 0}; + } case ('h'): if (str[*local_pos + 1] == 'h') { *local_pos += 2; - return LengthModifier::hh; + return {LengthModifier::hh, 0}; } else { ++*local_pos; - return LengthModifier::h; + return {LengthModifier::h, 0}; } case ('L'): ++*local_pos; - return LengthModifier::L; + return {LengthModifier::L, 0}; case ('j'): ++*local_pos; - return LengthModifier::j; + return {LengthModifier::j, 0}; case ('z'): ++*local_pos; - return LengthModifier::z; + return {LengthModifier::z, 0}; case ('t'): ++*local_pos; - return LengthModifier::t; + return {LengthModifier::t, 0}; default: - return LengthModifier::none; + return {LengthModifier::none, 0}; } } @@ -509,7 +540,7 @@ template class Parser { } } - LengthModifier lm = parse_length_modifier(&local_pos); + auto [lm, bw] = parse_length_modifier(&local_pos); // if we don't have an index for this conversion, then its position is // unknown and all this information is irrelevant. 
The rest of this @@ -560,6 +591,18 @@ template class Parser { case (LengthModifier::t): conv_size = type_desc_from_type(); break; + case (LengthModifier::w): + case (LengthModifier::wf): + if (bw <= INT_WIDTH) { + conv_size = type_desc_from_type(); + } else if (bw <= LONG_WIDTH) { + conv_size = type_desc_from_type(); + } else if (bw <= LLONG_WIDTH) { + conv_size = type_desc_from_type(); + } else { + conv_size = type_desc_from_type(); + } + break; } break; #ifndef LIBC_COPT_PRINTF_DISABLE_FLOAT diff --git a/libc/src/stdio/printf_core/write_int_converter.h b/libc/src/stdio/printf_core/write_int_converter.h index 0310905f36f14..18aa5c79897ec 100644 --- a/libc/src/stdio/printf_core/write_int_converter.h +++ b/libc/src/stdio/printf_core/write_int_converter.h @@ -55,6 +55,8 @@ LIBC_INLINE int convert_write_int(Writer *writer, *reinterpret_cast(to_conv.conv_val_ptr) = written; break; case LengthModifier::j: + case LengthModifier::w: + case LengthModifier::wf: *reinterpret_cast(to_conv.conv_val_ptr) = written; break; } diff --git a/libc/test/UnitTest/PrintfMatcher.cpp b/libc/test/UnitTest/PrintfMatcher.cpp index 32f3be73307e3..c8303815c9229 100644 --- a/libc/test/UnitTest/PrintfMatcher.cpp +++ b/libc/test/UnitTest/PrintfMatcher.cpp @@ -39,6 +39,10 @@ namespace { case (LengthModifier::lm): \ tlog << #lm; \ break +#define CASE_LM_BIT_WIDTH(lm, bw) \ + case (LengthModifier::lm): \ + tlog << #lm << "\n\tbit width: :" << bw; \ + break static void display(FormatSection form) { tlog << "Raw String (len " << form.raw_string.size() << "): \""; @@ -67,6 +71,8 @@ static void display(FormatSection form) { CASE_LM(z); CASE_LM(t); CASE_LM(L); + CASE_LM_BIT_WIDTH(w, form.bit_width); + CASE_LM_BIT_WIDTH(wf, form.bit_width); } tlog << "\n"; tlog << "\tconversion name: " << form.conv_name << "\n"; diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 3b756127fe21e..ae2cbad7d5a7d 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -178,6 +178,7 @@ add_fp_unittest( libc.src.math.canonicalize libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.fenv_impl + libc.src.__support.integer_literals ) add_fp_unittest( @@ -193,6 +194,7 @@ add_fp_unittest( libc.src.math.canonicalizef libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.fenv_impl + libc.src.__support.integer_literals ) add_fp_unittest( @@ -208,6 +210,7 @@ add_fp_unittest( libc.src.math.canonicalizef128 libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.fenv_impl + libc.src.__support.integer_literals ) add_fp_unittest( @@ -223,6 +226,7 @@ add_fp_unittest( libc.src.math.canonicalizel libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.fenv_impl + libc.src.__support.integer_literals ) add_fp_unittest( diff --git a/libc/test/src/math/smoke/CanonicalizeTest.h b/libc/test/src/math/smoke/CanonicalizeTest.h index c8af83f198388..4361f7d8ac7ab 100644 --- a/libc/test/src/math/smoke/CanonicalizeTest.h +++ b/libc/test/src/math/smoke/CanonicalizeTest.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_TEST_SRC_MATH_SMOKE_CANONICALIZETEST_H #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/integer_literals.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -22,6 +23,8 @@ #define TEST_REGULAR(x, y, expected) TEST_SPECIAL(x, y, expected, 0) +using LIBC_NAMESPACE::operator""_u128; + template class CanonicalizeTest : public LIBC_NAMESPACE::testing::Test { @@ -55,33 +58,31 @@ class CanonicalizeTest : public 
LIBC_NAMESPACE::testing::Test { T cx; // Exponent | Significand | Meaning // | Bits 63-62 | Bits 61-0 | - // All Ones | 00 | Zero | Pseudo Infinity, Value = Infinty - - FPBits test1((UInt128(0x7FFF) << 64) + UInt128(0x0000000000000000)); + // All Ones | 00 | Zero | Pseudo Infinity, Value = SNaN + FPBits test1(0x00000000'00007FFF'00000000'00000000_u128); const T test1_val = test1.get_val(); - TEST_SPECIAL(cx, test1_val, 0, 0); - EXPECT_FP_EQ(cx, inf); + TEST_SPECIAL(cx, test1_val, 1, FE_INVALID); + EXPECT_FP_EQ(cx, aNaN); // Exponent | Significand | Meaning // | Bits 63-62 | Bits 61-0 | // All Ones | 00 | Non-Zero | Pseudo NaN, Value = SNaN - - FPBits test2_1((UInt128(0x7FFF) << 64) + UInt128(0x0000000000000001)); + FPBits test2_1(0x00000000'00007FFF'00000000'00000001_u128); const T test2_1_val = test2_1.get_val(); TEST_SPECIAL(cx, test2_1_val, 1, FE_INVALID); EXPECT_FP_EQ(cx, aNaN); - FPBits test2_2((UInt128(0x7FFF) << 64) + UInt128(0x0000004270000001)); + FPBits test2_2(0x00000000'00007FFF'00000042'70000001_u128); const T test2_2_val = test2_2.get_val(); TEST_SPECIAL(cx, test2_2_val, 1, FE_INVALID); EXPECT_FP_EQ(cx, aNaN); - FPBits test2_3((UInt128(0x7FFF) << 64) + UInt128(0x0000000008261001)); + FPBits test2_3(0x00000000'00007FFF'00000000'08261001_u128); const T test2_3_val = test2_3.get_val(); TEST_SPECIAL(cx, test2_3_val, 1, FE_INVALID); EXPECT_FP_EQ(cx, aNaN); - FPBits test2_4((UInt128(0x7FFF) << 64) + UInt128(0x0000780008261001)); + FPBits test2_4(0x00000000'00007FFF'00007800'08261001_u128); const T test2_4_val = test2_4.get_val(); TEST_SPECIAL(cx, test2_4_val, 1, FE_INVALID); EXPECT_FP_EQ(cx, aNaN); @@ -89,23 +90,22 @@ class CanonicalizeTest : public LIBC_NAMESPACE::testing::Test { // Exponent | Significand | Meaning // | Bits 63-62 | Bits 61-0 | // All Ones | 01 | Anything | Pseudo NaN, Value = SNaN - - FPBits test3_1((UInt128(0x7FFF) << 64) + UInt128(0x4000000000000000)); + FPBits test3_1(0x00000000'00007FFF'40000000'00000000_u128); const T test3_1_val = test3_1.get_val(); TEST_SPECIAL(cx, test3_1_val, 1, FE_INVALID); EXPECT_FP_EQ(cx, aNaN); - FPBits test3_2((UInt128(0x7FFF) << 64) + UInt128(0x4000004270000001)); + FPBits test3_2(0x00000000'00007FFF'40000042'70000001_u128); const T test3_2_val = test3_2.get_val(); TEST_SPECIAL(cx, test3_2_val, 1, FE_INVALID); EXPECT_FP_EQ(cx, aNaN); - FPBits test3_3((UInt128(0x7FFF) << 64) + UInt128(0x4000000008261001)); + FPBits test3_3(0x00000000'00007FFF'40000000'08261001_u128); const T test3_3_val = test3_3.get_val(); TEST_SPECIAL(cx, test3_3_val, 1, FE_INVALID); EXPECT_FP_EQ(cx, aNaN); - FPBits test3_4((UInt128(0x7FFF) << 64) + UInt128(0x4007800008261001)); + FPBits test3_4(0x00000000'00007FFF'40007800'08261001_u128); const T test3_4_val = test3_4.get_val(); TEST_SPECIAL(cx, test3_4_val, 1, FE_INVALID); EXPECT_FP_EQ(cx, aNaN); @@ -114,20 +114,19 @@ class CanonicalizeTest : public LIBC_NAMESPACE::testing::Test { // | Bit 63 | Bits 62-0 | // All zeroes | One | Anything | Pseudo Denormal, Value = // | | | (−1)**s × m × 2**−16382 - - FPBits test4_1((UInt128(0x0000) << 64) + UInt128(0x8000000000000000)); + FPBits test4_1(0x00000000'00000000'80000000'00000000_u128); const T test4_1_val = test4_1.get_val(); TEST_SPECIAL(cx, test4_1_val, 0, 0); EXPECT_FP_EQ( cx, FPBits::make_value(test4_1.get_explicit_mantissa(), 0).get_val()); - FPBits test4_2((UInt128(0x0000) << 64) + UInt128(0x8000004270000001)); + FPBits test4_2(0x00000000'00000000'80000042'70000001_u128); const T test4_2_val = test4_2.get_val(); TEST_SPECIAL(cx, test4_2_val, 0, 0); EXPECT_FP_EQ( 
cx, FPBits::make_value(test4_2.get_explicit_mantissa(), 0).get_val()); - FPBits test4_3((UInt128(0x0000) << 64) + UInt128(0x8000000008261001)); + FPBits test4_3(0x00000000'00000000'80000000'08261001_u128); const T test4_3_val = test4_3.get_val(); TEST_SPECIAL(cx, test4_3_val, 0, 0); EXPECT_FP_EQ( @@ -135,44 +134,37 @@ class CanonicalizeTest : public LIBC_NAMESPACE::testing::Test { // Exponent | Significand | Meaning // | Bit 63 | Bits 62-0 | - // All Other | Zero | Anything | Unnormal, Value = - // Values | | | (−1)**s × m × 2**−16382 - - FPBits test5_1(UInt128(0x0000000000000001)); + // All Other | Zero | Anything | Unnormal, Value = SNaN + // Values | | | + FPBits test5_1(0x00000000'00000040'00000000'00000001_u128); const T test5_1_val = test5_1.get_val(); - TEST_SPECIAL(cx, test5_1_val, 0, 0); - EXPECT_FP_EQ( - cx, FPBits::make_value(test5_1.get_explicit_mantissa(), 0).get_val()); + TEST_SPECIAL(cx, test5_1_val, 1, FE_INVALID); + EXPECT_FP_EQ(cx, aNaN); - FPBits test5_2(UInt128(0x0000004270000001)); + FPBits test5_2(0x00000000'00000230'00000042'70000001_u128); const T test5_2_val = test5_2.get_val(); - TEST_SPECIAL(cx, test5_2_val, 0, 0); - EXPECT_FP_EQ( - cx, FPBits::make_value(test5_2.get_explicit_mantissa(), 0).get_val()); + TEST_SPECIAL(cx, test5_2_val, 1, FE_INVALID); + EXPECT_FP_EQ(cx, aNaN); - FPBits test5_3(UInt128(0x0000000008261001)); + FPBits test5_3(0x00000000'00000560'00000000'08261001_u128); const T test5_3_val = test5_3.get_val(); - TEST_SPECIAL(cx, test5_3_val, 0, 0); - EXPECT_FP_EQ( - cx, FPBits::make_value(test5_3.get_explicit_mantissa(), 0).get_val()); + TEST_SPECIAL(cx, test5_3_val, 1, FE_INVALID); + EXPECT_FP_EQ(cx, aNaN); - FPBits test5_4(UInt128(0x0000002816000000)); + FPBits test5_4(0x00000000'00000780'00000028'16000000_u128); const T test5_4_val = test5_4.get_val(); - TEST_SPECIAL(cx, test5_4_val, 0, 0); - EXPECT_FP_EQ( - cx, FPBits::make_value(test5_4.get_explicit_mantissa(), 0).get_val()); + TEST_SPECIAL(cx, test5_4_val, 1, FE_INVALID); + EXPECT_FP_EQ(cx, aNaN); - FPBits test5_5(UInt128(0x0000004270000001)); + FPBits test5_5(0x00000000'00000900'00000042'70000001_u128); const T test5_5_val = test5_5.get_val(); - TEST_SPECIAL(cx, test5_5_val, 0, 0); - EXPECT_FP_EQ( - cx, FPBits::make_value(test5_5.get_explicit_mantissa(), 0).get_val()); + TEST_SPECIAL(cx, test5_5_val, 1, FE_INVALID); + EXPECT_FP_EQ(cx, aNaN); - FPBits test5_6(UInt128(0x0000000008261001)); + FPBits test5_6(0x00000000'00000AB0'00000000'08261001_u128); const T test5_6_val = test5_6.get_val(); - TEST_SPECIAL(cx, test5_6_val, 0, 0); - EXPECT_FP_EQ( - cx, FPBits::make_value(test5_6.get_explicit_mantissa(), 0).get_val()); + TEST_SPECIAL(cx, test5_6_val, 1, FE_INVALID); + EXPECT_FP_EQ(cx, aNaN); } } diff --git a/libc/test/src/stdio/printf_core/parser_test.cpp b/libc/test/src/stdio/printf_core/parser_test.cpp index 0134277c4a1b2..66d6dd0a86c42 100644 --- a/libc/test/src/stdio/printf_core/parser_test.cpp +++ b/libc/test/src/stdio/printf_core/parser_test.cpp @@ -223,6 +223,42 @@ TEST(LlvmLibcPrintfParserTest, EvalOneArgWithLongLengthModifier) { ASSERT_PFORMAT_EQ(expected, format_arr[0]); } +TEST(LlvmLibcPrintfParserTest, EvalOneArgWithBitWidthLengthModifier) { + LIBC_NAMESPACE::printf_core::FormatSection format_arr[10]; + const char *str = "%w32d"; + long long arg1 = 12345; + evaluate(format_arr, str, arg1); + + LIBC_NAMESPACE::printf_core::FormatSection expected; + expected.has_conv = true; + + expected.raw_string = {str, 5}; + expected.length_modifier = LIBC_NAMESPACE::printf_core::LengthModifier::w; + 
expected.bit_width = 32; + expected.conv_val_raw = arg1; + expected.conv_name = 'd'; + + ASSERT_PFORMAT_EQ(expected, format_arr[0]); +} + +TEST(LlvmLibcPrintfParserTest, EvalOneArgWithFastBitWidthLengthModifier) { + LIBC_NAMESPACE::printf_core::FormatSection format_arr[10]; + const char *str = "%wf32d"; + long long arg1 = 12345; + evaluate(format_arr, str, arg1); + + LIBC_NAMESPACE::printf_core::FormatSection expected; + expected.has_conv = true; + + expected.raw_string = {str, 6}; + expected.length_modifier = LIBC_NAMESPACE::printf_core::LengthModifier::wf; + expected.bit_width = 32; + expected.conv_val_raw = arg1; + expected.conv_name = 'd'; + + ASSERT_PFORMAT_EQ(expected, format_arr[0]); +} + TEST(LlvmLibcPrintfParserTest, EvalOneArgWithAllOptions) { LIBC_NAMESPACE::printf_core::FormatSection format_arr[10]; const char *str = "% -056.78jd"; diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp index 8dde95d02a96d..8e9870f71a959 100644 --- a/libc/test/src/stdio/sprintf_test.cpp +++ b/libc/test/src/stdio/sprintf_test.cpp @@ -169,6 +169,93 @@ TEST(LlvmLibcSPrintfTest, IntConv) { EXPECT_EQ(written, 20); ASSERT_STREQ(buff, "-9223372036854775808"); // ll min + written = LIBC_NAMESPACE::sprintf(buff, "%w3d", 5807); + EXPECT_EQ(written, 1); + ASSERT_STREQ(buff, "7"); + + written = LIBC_NAMESPACE::sprintf(buff, "%w3d", 1); + EXPECT_EQ(written, 1); + ASSERT_STREQ(buff, "1"); + + written = LIBC_NAMESPACE::sprintf(buff, "%w64d", 9223372036854775807ll); + EXPECT_EQ(written, 19); + ASSERT_STREQ(buff, "9223372036854775807"); + + written = LIBC_NAMESPACE::sprintf(buff, "%w-1d", 5807); + EXPECT_EQ(written, 5); + ASSERT_STREQ(buff, "%w-1d"); + + written = LIBC_NAMESPACE::sprintf(buff, "%w0d", 5807); + EXPECT_EQ(written, 4); + ASSERT_STREQ(buff, "%w0d"); + + written = LIBC_NAMESPACE::sprintf(buff, "%w999d", 9223372036854775807ll); + EXPECT_EQ(written, 19); + ASSERT_STREQ(buff, "9223372036854775807"); + + written = LIBC_NAMESPACE::sprintf(buff, "%winvalid%w1d", 5807, 5807); + EXPECT_EQ(written, 10); + ASSERT_STREQ(buff, "%winvalid1"); + + written = LIBC_NAMESPACE::sprintf(buff, "%w-1d%w1d", 5807, 5807); + EXPECT_EQ(written, 6); + ASSERT_STREQ(buff, "%w-1d1"); + + char format[64]; + char uintmax[128]; + LIBC_NAMESPACE::sprintf(format, "%%w%du", sizeof(uintmax_t) * CHAR_BIT); + const int uintmax_len = + LIBC_NAMESPACE::sprintf(uintmax, "%ju", sizeof(uintmax_t) * CHAR_BIT); + written = LIBC_NAMESPACE::sprintf(buff, format, sizeof(uintmax_t) * CHAR_BIT); + EXPECT_EQ(written, uintmax_len); + ASSERT_STREQ(buff, uintmax); + + written = LIBC_NAMESPACE::sprintf(buff, "%w64u", 18446744073709551615ull); + EXPECT_EQ(written, 20); + ASSERT_STREQ(buff, "18446744073709551615"); // ull max + + written = + LIBC_NAMESPACE::sprintf(buff, "%w64d", -9223372036854775807ll - 1ll); + EXPECT_EQ(written, 20); + ASSERT_STREQ(buff, "-9223372036854775808"); // ll min + + written = LIBC_NAMESPACE::sprintf(buff, "%wf3d", 5807); + EXPECT_EQ(written, 1); + ASSERT_STREQ(buff, "7"); + + written = LIBC_NAMESPACE::sprintf(buff, "%wf3d", 1); + EXPECT_EQ(written, 1); + ASSERT_STREQ(buff, "1"); + + written = LIBC_NAMESPACE::sprintf(buff, "%wf64u", 18446744073709551615ull); + EXPECT_EQ(written, 20); + ASSERT_STREQ(buff, "18446744073709551615"); // ull max + + written = + LIBC_NAMESPACE::sprintf(buff, "%wf64d", -9223372036854775807ll - 1ll); + EXPECT_EQ(written, 20); + ASSERT_STREQ(buff, "-9223372036854775808"); // ll min + + written = LIBC_NAMESPACE::sprintf(buff, "%wf0d", 5807); + EXPECT_EQ(written, 5); + 
ASSERT_STREQ(buff, "%wf0d"); + + written = LIBC_NAMESPACE::sprintf(buff, "%wf-1d", 5807); + EXPECT_EQ(written, 6); + ASSERT_STREQ(buff, "%wf-1d"); + + written = LIBC_NAMESPACE::sprintf(buff, "%wfinvalid%wf1d", 5807, 5807); + EXPECT_EQ(written, 11); + ASSERT_STREQ(buff, "%wfinvalid1"); + + written = LIBC_NAMESPACE::sprintf(buff, "%wf-1d%wf1d", 5807, 5807); + EXPECT_EQ(written, 7); + ASSERT_STREQ(buff, "%wf-1d1"); + + written = LIBC_NAMESPACE::sprintf(buff, "%wf999d", 9223372036854775807ll); + EXPECT_EQ(written, 19); + ASSERT_STREQ(buff, "9223372036854775807"); + // Min Width Tests. written = LIBC_NAMESPACE::sprintf(buff, "%4d", 789); diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h index 9338038370197..9c7d328930c23 100644 --- a/libc/utils/gpu/loader/Loader.h +++ b/libc/utils/gpu/loader/Loader.h @@ -108,11 +108,11 @@ inline void handle_error(rpc_status_t) { } template -inline void register_rpc_callbacks(uint32_t device_id) { +inline void register_rpc_callbacks(rpc_device_t device) { static_assert(lane_size == 32 || lane_size == 64, "Invalid Lane size"); // Register the ping test for the `libc` tests. rpc_register_callback( - device_id, static_cast(RPC_TEST_INCREMENT), + device, static_cast(RPC_TEST_INCREMENT), [](rpc_port_t port, void *data) { rpc_recv_and_send( port, @@ -125,7 +125,7 @@ inline void register_rpc_callbacks(uint32_t device_id) { // Register the interface test callbacks. rpc_register_callback( - device_id, static_cast(RPC_TEST_INTERFACE), + device, static_cast(RPC_TEST_INTERFACE), [](rpc_port_t port, void *data) { uint64_t cnt = 0; bool end_with_recv; @@ -207,7 +207,7 @@ inline void register_rpc_callbacks(uint32_t device_id) { // Register the stream test handler. rpc_register_callback( - device_id, static_cast(RPC_TEST_STREAM), + device, static_cast(RPC_TEST_STREAM), [](rpc_port_t port, void *data) { uint64_t sizes[lane_size] = {0}; void *dst[lane_size] = {nullptr}; diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp index e3911eda2bd82..35840c6910bd8 100644 --- a/libc/utils/gpu/loader/amdgpu/Loader.cpp +++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp @@ -153,7 +153,8 @@ template hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable, hsa_amd_memory_pool_t kernargs_pool, hsa_amd_memory_pool_t coarsegrained_pool, - hsa_queue_t *queue, const LaunchParameters ¶ms, + hsa_queue_t *queue, rpc_device_t device, + const LaunchParameters ¶ms, const char *kernel_name, args_t kernel_args) { // Look up the '_start' kernel in the loaded executable. hsa_executable_symbol_t symbol; @@ -162,10 +163,9 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable, return err; // Register RPC callbacks for the malloc and free functions on HSA. 
- uint32_t device_id = 0; auto tuple = std::make_tuple(dev_agent, coarsegrained_pool); rpc_register_callback( - device_id, RPC_MALLOC, + device, RPC_MALLOC, [](rpc_port_t port, void *data) { auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void { auto &[dev_agent, pool] = *static_cast(data); @@ -182,7 +182,7 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable, }, &tuple); rpc_register_callback( - device_id, RPC_FREE, + device, RPC_FREE, [](rpc_port_t port, void *data) { auto free_handler = [](rpc_buffer_t *buffer, void *) { if (hsa_status_t err = hsa_amd_memory_pool_free( @@ -284,12 +284,12 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable, while (hsa_signal_wait_scacquire( packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0, /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0) - if (rpc_status_t err = rpc_handle_server(device_id)) + if (rpc_status_t err = rpc_handle_server(device)) handle_error(err); // Handle the server one more time in case the kernel exited with a pending // send still in flight. - if (rpc_status_t err = rpc_handle_server(device_id)) + if (rpc_status_t err = rpc_handle_server(device)) handle_error(err); // Destroy the resources acquired to launch the kernel and return. @@ -342,8 +342,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, handle_error(err); // Obtain a single agent for the device and host to use the HSA memory model. - uint32_t num_devices = 1; - uint32_t device_id = 0; hsa_agent_t dev_agent; hsa_agent_t host_agent; if (hsa_status_t err = get_agent(&dev_agent)) @@ -433,8 +431,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, handle_error(err); // Set up the RPC server. - if (rpc_status_t err = rpc_init(num_devices)) - handle_error(err); auto tuple = std::make_tuple(dev_agent, finegrained_pool); auto rpc_alloc = [](uint64_t size, void *data) { auto &[dev_agent, finegrained_pool] = *static_cast(data); @@ -445,15 +441,16 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr); return dev_ptr; }; - if (rpc_status_t err = rpc_server_init(device_id, RPC_MAXIMUM_PORT_COUNT, + rpc_device_t device; + if (rpc_status_t err = rpc_server_init(&device, RPC_MAXIMUM_PORT_COUNT, wavefront_size, rpc_alloc, &tuple)) handle_error(err); // Register callbacks for the RPC unit tests. if (wavefront_size == 32) - register_rpc_callbacks<32>(device_id); + register_rpc_callbacks<32>(device); else if (wavefront_size == 64) - register_rpc_callbacks<64>(device_id); + register_rpc_callbacks<64>(device); else handle_error("Invalid wavefront size"); @@ -483,10 +480,10 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, handle_error(err); void *rpc_client_buffer; - if (hsa_status_t err = hsa_amd_memory_lock( - const_cast(rpc_get_client_buffer(device_id)), - rpc_get_client_size(), - /*agents=*/nullptr, 0, &rpc_client_buffer)) + if (hsa_status_t err = + hsa_amd_memory_lock(const_cast(rpc_get_client_buffer(device)), + rpc_get_client_size(), + /*agents=*/nullptr, 0, &rpc_client_buffer)) handle_error(err); // Copy the RPC client buffer to the address pointed to by the symbol. 
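A note on the registration pattern used in the hunks above: because
rpc_register_callback takes a plain function pointer, the handlers are
capture-free lambdas, and any state they need (the agent/pool tuple here, a
stream in the CUDA loader) travels through the void * user-data parameter. A
reduced, self-contained sketch of that shape (illustrative names, not the RPC
API itself):

    #include <tuple>

    // C-style callback: a captureless lambda decays to a function pointer;
    // the state it needs is passed back through the void * parameter.
    using Callback = void (*)(void *data);

    void invoke(Callback cb, void *data) { cb(data); }

    int main() {
      using State = std::tuple<int, int>; // stand-in for (dev_agent, pool)
      State state{1, 2};
      invoke(
          [](void *data) {
            auto &[agent, pool] = *static_cast<State *>(data);
            (void)agent; // a real handler would allocate from `pool`
            (void)pool;
          },
          &state);
    }

The tuple must outlive every call that can invoke the handler, which is why
the loaders keep it on the stack for the whole launch.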
@@ -496,7 +493,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, handle_error(err); if (hsa_status_t err = hsa_amd_memory_unlock( - const_cast(rpc_get_client_buffer(device_id)))) + const_cast(rpc_get_client_buffer(device)))) handle_error(err); if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host)) handle_error(err); @@ -549,13 +546,13 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, begin_args_t init_args = {argc, dev_argv, dev_envp}; if (hsa_status_t err = launch_kernel( dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, - single_threaded_params, "_begin.kd", init_args)) + device, single_threaded_params, "_begin.kd", init_args)) handle_error(err); start_args_t args = {argc, dev_argv, dev_envp, dev_ret}; - if (hsa_status_t err = - launch_kernel(dev_agent, executable, kernargs_pool, - coarsegrained_pool, queue, params, "_start.kd", args)) + if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool, + coarsegrained_pool, queue, device, + params, "_start.kd", args)) handle_error(err); void *host_ret; @@ -575,11 +572,11 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, end_args_t fini_args = {ret}; if (hsa_status_t err = launch_kernel( dev_agent, executable, kernargs_pool, coarsegrained_pool, queue, - single_threaded_params, "_end.kd", fini_args)) + device, single_threaded_params, "_end.kd", fini_args)) handle_error(err); if (rpc_status_t err = rpc_server_shutdown( - device_id, [](void *ptr, void *) { hsa_amd_memory_pool_free(ptr); }, + device, [](void *ptr, void *) { hsa_amd_memory_pool_free(ptr); }, nullptr)) handle_error(err); @@ -600,8 +597,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, if (hsa_status_t err = hsa_code_object_destroy(object)) handle_error(err); - if (rpc_status_t err = rpc_shutdown()) - handle_error(err); if (hsa_status_t err = hsa_shut_down()) handle_error(err); diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp index 5388f287063b7..1818932f0a966 100644 --- a/libc/utils/gpu/loader/nvptx/Loader.cpp +++ b/libc/utils/gpu/loader/nvptx/Loader.cpp @@ -154,8 +154,8 @@ Expected get_ctor_dtor_array(const void *image, const size_t size, template CUresult launch_kernel(CUmodule binary, CUstream stream, - const LaunchParameters ¶ms, const char *kernel_name, - args_t kernel_args) { + rpc_device_t rpc_device, const LaunchParameters ¶ms, + const char *kernel_name, args_t kernel_args) { // look up the '_start' kernel in the loaded module. CUfunction function; if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name)) @@ -175,11 +175,10 @@ CUresult launch_kernel(CUmodule binary, CUstream stream, handle_error(err); // Register RPC callbacks for the malloc and free functions on HSA. 
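The lines below make the same swap in the CUDA loader: the fixed integer
device_id gives way to the opaque rpc_device_t declared later in
llvmlibc_rpc_server.h, which is simply a uintptr_t holding a pointer to the
server's per-device state. The pattern, sketched with a hypothetical `State`
type:

    #include <cstdint>

    struct State; // opaque to callers; defined only inside the server

    // C-compatible opaque handle: the header exposes the handle layout,
    // never the State it points to.
    typedef struct device_s {
      uintptr_t handle;
    } device_t;

    inline device_t wrap(State *s) {
      return device_t{reinterpret_cast<uintptr_t>(s)};
    }

    inline State *unwrap(device_t d) {
      return reinterpret_cast<State *>(d.handle);
    }

Because each handle owns its own state, the global device table indexed by id,
along with the rpc_init/rpc_shutdown reference counting that guarded it, can
be deleted outright, as the rpc_server.cpp hunks further down show.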
- uint32_t device_id = 0; - register_rpc_callbacks<32>(device_id); + register_rpc_callbacks<32>(rpc_device); rpc_register_callback( - device_id, RPC_MALLOC, + rpc_device, RPC_MALLOC, [](rpc_port_t port, void *data) { auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void { CUstream memory_stream = *static_cast(data); @@ -197,7 +196,7 @@ CUresult launch_kernel(CUmodule binary, CUstream stream, }, &memory_stream); rpc_register_callback( - device_id, RPC_FREE, + rpc_device, RPC_FREE, [](rpc_port_t port, void *data) { auto free_handler = [](rpc_buffer_t *buffer, void *data) { CUstream memory_stream = *static_cast(data); @@ -219,12 +218,12 @@ CUresult launch_kernel(CUmodule binary, CUstream stream, // Wait until the kernel has completed execution on the device. Periodically // check the RPC client for work to be performed on the server. while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY) - if (rpc_status_t err = rpc_handle_server(device_id)) + if (rpc_status_t err = rpc_handle_server(rpc_device)) handle_error(err); // Handle the server one more time in case the kernel exited with a pending // send still in flight. - if (rpc_status_t err = rpc_handle_server(device_id)) + if (rpc_status_t err = rpc_handle_server(rpc_device)) handle_error(err); return CUDA_SUCCESS; @@ -235,7 +234,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, if (CUresult err = cuInit(0)) handle_error(err); // Obtain the first device found on the system. - uint32_t num_devices = 1; uint32_t device_id = 0; CUdevice device; if (CUresult err = cuDeviceGet(&device, device_id)) @@ -294,9 +292,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, if (CUresult err = cuMemsetD32(dev_ret, 0, 1)) handle_error(err); - if (rpc_status_t err = rpc_init(num_devices)) - handle_error(err); - uint32_t warp_size = 32; auto rpc_alloc = [](uint64_t size, void *) -> void * { void *dev_ptr; @@ -304,7 +299,8 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, handle_error(err); return dev_ptr; }; - if (rpc_status_t err = rpc_server_init(device_id, RPC_MAXIMUM_PORT_COUNT, + rpc_device_t rpc_device; + if (rpc_status_t err = rpc_server_init(&rpc_device, RPC_MAXIMUM_PORT_COUNT, warp_size, rpc_alloc, nullptr)) handle_error(err); @@ -321,19 +317,20 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, cuMemcpyDtoH(&rpc_client_host, rpc_client_dev, sizeof(void *))) handle_error(err); if (CUresult err = - cuMemcpyHtoD(rpc_client_host, rpc_get_client_buffer(device_id), + cuMemcpyHtoD(rpc_client_host, rpc_get_client_buffer(rpc_device), rpc_get_client_size())) handle_error(err); LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1}; begin_args_t init_args = {argc, dev_argv, dev_envp}; - if (CUresult err = launch_kernel(binary, stream, single_threaded_params, - "_begin", init_args)) + if (CUresult err = launch_kernel(binary, stream, rpc_device, + single_threaded_params, "_begin", init_args)) handle_error(err); start_args_t args = {argc, dev_argv, dev_envp, reinterpret_cast(dev_ret)}; - if (CUresult err = launch_kernel(binary, stream, params, "_start", args)) + if (CUresult err = + launch_kernel(binary, stream, rpc_device, params, "_start", args)) handle_error(err); // Copy the return value back from the kernel and wait. 
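Both loaders now drive the server with the loop just shown: poll the stream
for completion, service any RPC traffic, then service once more after the
kernel finishes in case it exited with a send still in flight. The shape,
stripped of the HSA/CUDA specifics (hypothetical helper names):

    // Generic host-side service loop; `stream_busy` stands in for
    // cuStreamQuery / hsa_signal_wait_scacquire, and `serve_once` for
    // rpc_handle_server.
    template <typename Busy, typename Serve>
    int pump_server(Busy stream_busy, Serve serve_once) {
      while (stream_busy())
        if (int err = serve_once())
          return err;
      // Drain one last time: the kernel may have returned to the host
      // with a send the server has not yet consumed.
      return serve_once();
    }

The final drain matters because stream completion only says the kernel
finished, not that the server has answered its last request.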
@@ -345,8 +342,8 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, handle_error(err); end_args_t fini_args = {host_ret}; - if (CUresult err = launch_kernel(binary, stream, single_threaded_params, - "_end", fini_args)) + if (CUresult err = launch_kernel(binary, stream, rpc_device, + single_threaded_params, "_end", fini_args)) handle_error(err); // Free the memory allocated for the device. @@ -357,7 +354,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, if (CUresult err = cuMemFreeHost(dev_argv)) handle_error(err); if (rpc_status_t err = rpc_server_shutdown( - device_id, [](void *ptr, void *) { cuMemFreeHost(ptr); }, nullptr)) + rpc_device, [](void *ptr, void *) { cuMemFreeHost(ptr); }, nullptr)) handle_error(err); // Destroy the context and the loaded binary. @@ -365,7 +362,5 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, handle_error(err); if (CUresult err = cuDevicePrimaryCtxRelease(device)) handle_error(err); - if (rpc_status_t err = rpc_shutdown()) - handle_error(err); return host_ret; } diff --git a/libc/utils/gpu/server/llvmlibc_rpc_server.h b/libc/utils/gpu/server/llvmlibc_rpc_server.h index b7f2a463b1f5c..b0cf2f916b385 100644 --- a/libc/utils/gpu/server/llvmlibc_rpc_server.h +++ b/libc/utils/gpu/server/llvmlibc_rpc_server.h @@ -27,10 +27,8 @@ typedef enum { RPC_STATUS_SUCCESS = 0x0, RPC_STATUS_CONTINUE = 0x1, RPC_STATUS_ERROR = 0x1000, - RPC_STATUS_OUT_OF_RANGE = 0x1001, - RPC_STATUS_UNHANDLED_OPCODE = 0x1002, - RPC_STATUS_INVALID_LANE_SIZE = 0x1003, - RPC_STATUS_NOT_INITIALIZED = 0x1004, + RPC_STATUS_UNHANDLED_OPCODE = 0x1001, + RPC_STATUS_INVALID_LANE_SIZE = 0x1002, } rpc_status_t; /// A struct containing an opaque handle to an RPC port. This is what allows the @@ -45,6 +43,11 @@ typedef struct rpc_buffer_s { uint64_t data[8]; } rpc_buffer_t; +/// An opaque handle to an RPC server that can be attached to a device. +typedef struct rpc_device_s { + uintptr_t handle; +} rpc_device_t; + /// A function used to allocate \p bytes for use by the RPC server and client. /// The memory should support asynchronous and atomic access from both the /// client and server. @@ -60,34 +63,28 @@ typedef void (*rpc_opcode_callback_ty)(rpc_port_t port, void *data); /// A callback function to use the port to receive or send a \p buffer. typedef void (*rpc_port_callback_ty)(rpc_buffer_t *buffer, void *data); -/// Initialize the rpc library for general use on \p num_devices. -rpc_status_t rpc_init(uint32_t num_devices); - -/// Shut down the rpc interface. -rpc_status_t rpc_shutdown(void); - -/// Initialize the server for a given device. -rpc_status_t rpc_server_init(uint32_t device_id, uint64_t num_ports, +/// Initialize the server for a given device and return it in \p device. +rpc_status_t rpc_server_init(rpc_device_t *rpc_device, uint64_t num_ports, uint32_t lane_size, rpc_alloc_ty alloc, void *data); /// Shut down the server for a given device. -rpc_status_t rpc_server_shutdown(uint32_t device_id, rpc_free_ty dealloc, +rpc_status_t rpc_server_shutdown(rpc_device_t rpc_device, rpc_free_ty dealloc, void *data); /// Queries the RPC clients at least once and performs server-side work if there /// are any active requests. Runs until all work on the server is completed. -rpc_status_t rpc_handle_server(uint32_t device_id); +rpc_status_t rpc_handle_server(rpc_device_t rpc_device); /// Register a callback to handle an opcode from the RPC client. 
The associated /// data must remain accessible as long as the user intends to handle the server /// with this callback. -rpc_status_t rpc_register_callback(uint32_t device_id, uint16_t opcode, +rpc_status_t rpc_register_callback(rpc_device_t rpc_device, uint16_t opcode, rpc_opcode_callback_ty callback, void *data); /// Obtain a pointer to a local client buffer that can be copied directly to the /// other process using the address stored at the rpc client symbol name. -const void *rpc_get_client_buffer(uint32_t device_id); +const void *rpc_get_client_buffer(rpc_device_t device); /// Returns the size of the client in bytes to be used for a memory copy. uint64_t rpc_get_client_size(); diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp index 46ad98fa02cc5..fd306642fdcc4 100644 --- a/libc/utils/gpu/server/rpc_server.cpp +++ b/libc/utils/gpu/server/rpc_server.cpp @@ -248,127 +248,75 @@ struct Device { std::unordered_map callback_data; }; -// A struct containing all the runtime state required to run the RPC server. -struct State { - State(uint32_t num_devices) - : num_devices(num_devices), devices(num_devices), reference_count(0u) {} - uint32_t num_devices; - std::vector> devices; - std::atomic_uint32_t reference_count; -}; - -static std::mutex startup_mutex; - -static State *state; - -rpc_status_t rpc_init(uint32_t num_devices) { - std::scoped_lock lock(startup_mutex); - if (!state) - state = new State(num_devices); - - if (state->reference_count == std::numeric_limits::max()) - return RPC_STATUS_ERROR; - - state->reference_count++; - - return RPC_STATUS_SUCCESS; -} - -rpc_status_t rpc_shutdown(void) { - if (state && state->reference_count-- == 1) - delete state; - - return RPC_STATUS_SUCCESS; -} - -rpc_status_t rpc_server_init(uint32_t device_id, uint64_t num_ports, +rpc_status_t rpc_server_init(rpc_device_t *rpc_device, uint64_t num_ports, uint32_t lane_size, rpc_alloc_ty alloc, void *data) { - if (!state) - return RPC_STATUS_NOT_INITIALIZED; - if (device_id >= state->num_devices) - return RPC_STATUS_OUT_OF_RANGE; + if (!rpc_device) + return RPC_STATUS_ERROR; if (lane_size != 1 && lane_size != 32 && lane_size != 64) return RPC_STATUS_INVALID_LANE_SIZE; - if (!state->devices[device_id]) { - uint64_t size = rpc::Server::allocation_size(lane_size, num_ports); - void *buffer = alloc(size, data); + uint64_t size = rpc::Server::allocation_size(lane_size, num_ports); + void *buffer = alloc(size, data); - if (!buffer) - return RPC_STATUS_ERROR; + if (!buffer) + return RPC_STATUS_ERROR; - state->devices[device_id] = - std::make_unique(lane_size, num_ports, buffer); - if (!state->devices[device_id]) - return RPC_STATUS_ERROR; - } + Device *device = new Device(lane_size, num_ports, buffer); + if (!device) + return RPC_STATUS_ERROR; + rpc_device->handle = reinterpret_cast(device); return RPC_STATUS_SUCCESS; } -rpc_status_t rpc_server_shutdown(uint32_t device_id, rpc_free_ty dealloc, +rpc_status_t rpc_server_shutdown(rpc_device_t rpc_device, rpc_free_ty dealloc, void *data) { - if (!state) - return RPC_STATUS_NOT_INITIALIZED; - if (device_id >= state->num_devices) - return RPC_STATUS_OUT_OF_RANGE; - if (!state->devices[device_id]) + if (!rpc_device.handle) return RPC_STATUS_ERROR; - dealloc(state->devices[device_id]->buffer, data); - if (state->devices[device_id]) - state->devices[device_id].release(); + Device *device = reinterpret_cast(rpc_device.handle); + dealloc(device->buffer, data); + delete device; return RPC_STATUS_SUCCESS; } -rpc_status_t 
rpc_handle_server(uint32_t device_id) { - if (!state) - return RPC_STATUS_NOT_INITIALIZED; - if (device_id >= state->num_devices) - return RPC_STATUS_OUT_OF_RANGE; - if (!state->devices[device_id]) +rpc_status_t rpc_handle_server(rpc_device_t rpc_device) { + if (!rpc_device.handle) return RPC_STATUS_ERROR; + Device *device = reinterpret_cast(rpc_device.handle); uint32_t index = 0; for (;;) { - Device &device = *state->devices[device_id]; - rpc_status_t status = device.handle_server(index); + rpc_status_t status = device->handle_server(index); if (status != RPC_STATUS_CONTINUE) return status; } } -rpc_status_t rpc_register_callback(uint32_t device_id, uint16_t opcode, +rpc_status_t rpc_register_callback(rpc_device_t rpc_device, uint16_t opcode, rpc_opcode_callback_ty callback, void *data) { - if (!state) - return RPC_STATUS_NOT_INITIALIZED; - if (device_id >= state->num_devices) - return RPC_STATUS_OUT_OF_RANGE; - if (!state->devices[device_id]) + if (!rpc_device.handle) return RPC_STATUS_ERROR; - state->devices[device_id]->callbacks[opcode] = callback; - state->devices[device_id]->callback_data[opcode] = data; + Device *device = reinterpret_cast(rpc_device.handle); + + device->callbacks[opcode] = callback; + device->callback_data[opcode] = data; return RPC_STATUS_SUCCESS; } -const void *rpc_get_client_buffer(uint32_t device_id) { - if (!state || device_id >= state->num_devices || !state->devices[device_id]) +const void *rpc_get_client_buffer(rpc_device_t rpc_device) { + if (!rpc_device.handle) return nullptr; - return &state->devices[device_id]->client; + Device *device = reinterpret_cast(rpc_device.handle); + return &device->client; } uint64_t rpc_get_client_size() { return sizeof(rpc::Client); } -using ServerPort = std::variant; - -ServerPort get_port(rpc_port_t ref) { - return reinterpret_cast(ref.handle); -} - void rpc_send(rpc_port_t ref, rpc_port_callback_ty callback, void *data) { auto port = reinterpret_cast(ref.handle); port->send([=](rpc::Buffer *buffer) { diff --git a/libcxx/benchmarks/algorithms/mismatch.bench.cpp b/libcxx/benchmarks/algorithms/mismatch.bench.cpp index 9274932a764c5..06289068bb049 100644 --- a/libcxx/benchmarks/algorithms/mismatch.bench.cpp +++ b/libcxx/benchmarks/algorithms/mismatch.bench.cpp @@ -10,6 +10,15 @@ #include #include +void BenchmarkSizes(benchmark::internal::Benchmark* Benchmark) { + Benchmark->DenseRange(1, 8); + for (size_t i = 16; i != 1 << 20; i *= 2) { + Benchmark->Arg(i - 1); + Benchmark->Arg(i); + Benchmark->Arg(i + 1); + } +} + // TODO: Look into benchmarking aligned and unaligned memory explicitly // (currently things happen to be aligned because they are malloced that way) template @@ -24,8 +33,8 @@ static void bm_mismatch(benchmark::State& state) { benchmark::DoNotOptimize(std::mismatch(vec1.begin(), vec1.end(), vec2.begin())); } } -BENCHMARK(bm_mismatch)->DenseRange(1, 8)->Range(16, 1 << 20); -BENCHMARK(bm_mismatch)->DenseRange(1, 8)->Range(16, 1 << 20); -BENCHMARK(bm_mismatch)->DenseRange(1, 8)->Range(16, 1 << 20); +BENCHMARK(bm_mismatch)->Apply(BenchmarkSizes); +BENCHMARK(bm_mismatch)->Apply(BenchmarkSizes); +BENCHMARK(bm_mismatch)->Apply(BenchmarkSizes); BENCHMARK_MAIN(); diff --git a/libcxx/docs/DesignDocs/NodiscardPolicy.rst b/libcxx/docs/DesignDocs/NodiscardPolicy.rst new file mode 100644 index 0000000000000..afbb18b0096d7 --- /dev/null +++ b/libcxx/docs/DesignDocs/NodiscardPolicy.rst @@ -0,0 +1,42 @@ +=================================================== +Guidelines for applying ``[[nodiscard]]`` in libc++ 
+=================================================== + +Libc++ adds ``[[nodiscard]]`` to functions in a lot of places. The standards +committee has decided to not have a recommended practice where to put them, so +this document lists where ``[[nodiscard]]`` should be applied in libc++. + +When should ``[[nodiscard]]`` be added to functions? +==================================================== + +``[[nodiscard]]`` should be applied to functions + +- where discarding the return value is most likely a correctness issue. + For example a locking constructor in ``unique_lock``. + +- where discarding the return value likely points to the user wanting to do + something different. For example ``vector::empty()``, which probably should + have been ``vector::clear()``. + + This can help spotting bugs easily which otherwise may take a very long time + to find. + +- which return a constant. For example ``numeric_limits::min()``. +- which only observe a value. For example ``string::size()``. + + Code that discards values from these kinds of functions is dead code. It can + either be removed, or the programmer meant to do something different. + +- where discarding the value is most likely a misuse of the function. For + example ``find``. + + This protects programmers from assuming too much about how the internals of + a function work, making code more robust in the presence of future + optimizations. + +What should be done when adding ``[[nodiscard]]`` to a function? +================================================================ + +Applications of ``[[nodiscard]]`` are code like any other code, so we aim to +test them. This can be done with a ``.verify.cpp`` test. Many examples are +available. Just look for tests with the suffix ``.nodiscard.verify.cpp``. diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst index aa1bd4b83b265..2a7e47dfe6d88 100644 --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -189,6 +189,7 @@ Design Documents DesignDocs/FeatureTestMacros DesignDocs/FileTimeType DesignDocs/HeaderRemovalPolicy + DesignDocs/NodiscardPolicy DesignDocs/NoexceptPolicy DesignDocs/PSTLIntegration DesignDocs/ThreadingSupportAPI diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 982b85e4e2d62..07b5e974eaf52 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -430,22 +430,27 @@ set(files __fwd/array.h __fwd/bit_reference.h __fwd/complex.h + __fwd/deque.h __fwd/format.h __fwd/fstream.h __fwd/functional.h __fwd/ios.h __fwd/istream.h __fwd/mdspan.h + __fwd/memory.h __fwd/memory_resource.h __fwd/ostream.h __fwd/pair.h + __fwd/queue.h __fwd/span.h __fwd/sstream.h + __fwd/stack.h __fwd/streambuf.h __fwd/string.h __fwd/string_view.h __fwd/subrange.h __fwd/tuple.h + __fwd/vector.h __hash_table __ios/fpos.h __iterator/access.h diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h index 4eb693a1f2e9d..d933a84cada9e 100644 --- a/libcxx/include/__algorithm/mismatch.h +++ b/libcxx/include/__algorithm/mismatch.h @@ -64,7 +64,10 @@ __mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred& __pred, _Proj1& __ constexpr size_t __unroll_count = 4; constexpr size_t __vec_size = __native_vector_size<_Tp>; using __vec = __simd_vector<_Tp, __vec_size>; + if (!__libcpp_is_constant_evaluated()) { + auto __orig_first1 = __first1; + auto __last2 = __first2 + (__last1 - __first1); while (static_cast(__last1 - __first1) >= __unroll_count * __vec_size) [[__unlikely__]] { __vec __lhs[__unroll_count]; __vec 
__rhs[__unroll_count]; @@ -84,8 +87,32 @@ __mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred& __pred, _Proj1& __ __first1 += __unroll_count * __vec_size; __first2 += __unroll_count * __vec_size; } + + // check the remaining 0-3 vectors + while (static_cast(__last1 - __first1) >= __vec_size) { + if (auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2); + !std::__all_of(__cmp_res)) { + auto __offset = std::__find_first_not_set(__cmp_res); + return {__first1 + __offset, __first2 + __offset}; + } + __first1 += __vec_size; + __first2 += __vec_size; + } + + if (__last1 - __first1 == 0) + return {__first1, __first2}; + + // Check if we can load elements in front of the current pointer. If that's the case load a vector at + // (last - vector_size) to check the remaining elements + if (static_cast(__first1 - __orig_first1) >= __vec_size) { + __first1 = __last1 - __vec_size; + __first2 = __last2 - __vec_size; + auto __offset = + std::__find_first_not_set(std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2)); + return {__first1 + __offset, __first2 + __offset}; + } // else loop over the elements individually } - // TODO: Consider vectorizing the tail + return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2); } diff --git a/libcxx/include/__format/container_adaptor.h b/libcxx/include/__format/container_adaptor.h index ec806ef16bf52..9f49ca03bf4f5 100644 --- a/libcxx/include/__format/container_adaptor.h +++ b/libcxx/include/__format/container_adaptor.h @@ -18,11 +18,11 @@ #include <__format/concepts.h> #include <__format/formatter.h> #include <__format/range_default_formatter.h> +#include <__fwd/queue.h> +#include <__fwd/stack.h> #include <__ranges/ref_view.h> #include <__type_traits/is_const.h> #include <__type_traits/maybe_const.h> -#include -#include _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/include/__fwd/deque.h b/libcxx/include/__fwd/deque.h new file mode 100644 index 0000000000000..fd2fb5bb4b8e9 --- /dev/null +++ b/libcxx/include/__fwd/deque.h @@ -0,0 +1,26 @@ +//===---------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#ifndef _LIBCPP___FWD_DEQUE_H +#define _LIBCPP___FWD_DEQUE_H + +#include <__config> +#include <__fwd/memory.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +template > +class _LIBCPP_TEMPLATE_VIS deque; + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___FWD_DEQUE_H diff --git a/libcxx/include/__fwd/memory.h b/libcxx/include/__fwd/memory.h new file mode 100644 index 0000000000000..b9e151855ad7d --- /dev/null +++ b/libcxx/include/__fwd/memory.h @@ -0,0 +1,25 @@ +//===---------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#ifndef _LIBCPP___FWD_MEMORY_H +#define _LIBCPP___FWD_MEMORY_H + +#include <__config> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +template +class _LIBCPP_TEMPLATE_VIS allocator; + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___FWD_MEMORY_H diff --git a/libcxx/include/__fwd/queue.h b/libcxx/include/__fwd/queue.h new file mode 100644 index 0000000000000..50d99ad9c29f4 --- /dev/null +++ b/libcxx/include/__fwd/queue.h @@ -0,0 +1,31 @@ +//===---------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#ifndef _LIBCPP___FWD_QUEUE_H +#define _LIBCPP___FWD_QUEUE_H + +#include <__config> +#include <__functional/operations.h> +#include <__fwd/deque.h> +#include <__fwd/vector.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +template > +class _LIBCPP_TEMPLATE_VIS queue; + +template , class _Compare = less > +class _LIBCPP_TEMPLATE_VIS priority_queue; + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___FWD_QUEUE_H diff --git a/libcxx/include/__fwd/sstream.h b/libcxx/include/__fwd/sstream.h index e2d46fbe1d9bb..39a9c3faf1f80 100644 --- a/libcxx/include/__fwd/sstream.h +++ b/libcxx/include/__fwd/sstream.h @@ -10,6 +10,7 @@ #define _LIBCPP___FWD_SSTREAM_H #include <__config> +#include <__fwd/memory.h> #include <__fwd/string.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__fwd/stack.h b/libcxx/include/__fwd/stack.h new file mode 100644 index 0000000000000..7dab6c1a4f4e2 --- /dev/null +++ b/libcxx/include/__fwd/stack.h @@ -0,0 +1,26 @@ +//===---------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#ifndef _LIBCPP___FWD_STACK_H +#define _LIBCPP___FWD_STACK_H + +#include <__config> +#include <__fwd/deque.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +template > +class _LIBCPP_TEMPLATE_VIS stack; + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___FWD_STACK_H diff --git a/libcxx/include/__fwd/string.h b/libcxx/include/__fwd/string.h index 032132374de5e..320c4e4c81836 100644 --- a/libcxx/include/__fwd/string.h +++ b/libcxx/include/__fwd/string.h @@ -11,6 +11,7 @@ #include <__availability> #include <__config> +#include <__fwd/memory.h> #include <__fwd/memory_resource.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -39,9 +40,6 @@ template <> struct char_traits; #endif -template -class _LIBCPP_TEMPLATE_VIS allocator; - template , class _Allocator = allocator<_CharT> > class _LIBCPP_TEMPLATE_VIS basic_string; diff --git a/libcxx/include/__fwd/vector.h b/libcxx/include/__fwd/vector.h new file mode 100644 index 0000000000000..c9cc96137449f --- /dev/null +++ b/libcxx/include/__fwd/vector.h @@ -0,0 +1,26 @@ +//===---------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#ifndef _LIBCPP___FWD_VECTOR_H +#define _LIBCPP___FWD_VECTOR_H + +#include <__config> +#include <__fwd/memory.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +template > +class _LIBCPP_TEMPLATE_VIS vector; + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___FWD_VECTOR_H diff --git a/libcxx/include/__type_traits/apply_cv.h b/libcxx/include/__type_traits/apply_cv.h index 7c6aabec8344b..723af95b8d928 100644 --- a/libcxx/include/__type_traits/apply_cv.h +++ b/libcxx/include/__type_traits/apply_cv.h @@ -10,9 +10,7 @@ #define _LIBCPP___TYPE_TRAITS_APPLY_CV_H #include <__config> -#include <__type_traits/is_const.h> -#include <__type_traits/is_volatile.h> -#include <__type_traits/remove_reference.h> +#include <__type_traits/copy_cv.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -20,54 +18,16 @@ _LIBCPP_BEGIN_NAMESPACE_STD -template >::value, - bool = is_volatile<__libcpp_remove_reference_t<_Tp> >::value> -struct __apply_cv_impl { - template - using __apply _LIBCPP_NODEBUG = _Up; -}; - template -struct __apply_cv_impl<_Tp, true, false> { - template - using __apply _LIBCPP_NODEBUG = const _Up; -}; - -template -struct __apply_cv_impl<_Tp, false, true> { - template - using __apply _LIBCPP_NODEBUG = volatile _Up; -}; - -template -struct __apply_cv_impl<_Tp, true, true> { - template - using __apply _LIBCPP_NODEBUG = const volatile _Up; -}; - -template -struct __apply_cv_impl<_Tp&, false, false> { - template - using __apply _LIBCPP_NODEBUG = _Up&; -}; - -template -struct __apply_cv_impl<_Tp&, true, false> { - template - using __apply _LIBCPP_NODEBUG = const _Up&; -}; - -template -struct __apply_cv_impl<_Tp&, false, true> { +struct __apply_cv_impl { template - using __apply _LIBCPP_NODEBUG = volatile _Up&; + using __apply _LIBCPP_NODEBUG = __copy_cv_t<_Tp, _Up>; }; template -struct 
__apply_cv_impl<_Tp&, true, true> { +struct __apply_cv_impl<_Tp&> { template <class _Up> - using __apply _LIBCPP_NODEBUG = const volatile _Up&; + using __apply _LIBCPP_NODEBUG = __copy_cv_t<_Tp, _Up>&; }; template <class _Tp, class _Up> diff --git a/libcxx/include/deque b/libcxx/include/deque index 85ea9c6f661ed..a6472e46d426c 100644 --- a/libcxx/include/deque +++ b/libcxx/include/deque @@ -192,6 +192,7 @@ template #include <__availability> #include <__config> #include <__format/enable_insertable.h> +#include <__fwd/deque.h> #include <__iterator/distance.h> #include <__iterator/iterator_traits.h> #include <__iterator/next.h> @@ -244,9 +245,6 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD -template <class _Tp, class _Allocator = allocator<_Tp> > -class _LIBCPP_TEMPLATE_VIS deque; - template <class _ValueType, class _DiffType> struct __deque_block_size { static const _DiffType value = sizeof(_ValueType) < 256 ? 4096 / sizeof(_ValueType) : 16; diff --git a/libcxx/include/format b/libcxx/include/format index 146613464534f..f1e87de0f8301 100644 --- a/libcxx/include/format +++ b/libcxx/include/format @@ -223,6 +223,8 @@ namespace std { #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include <locale> +# include <queue> +# include <stack> #endif #endif // _LIBCPP_FORMAT diff --git a/libcxx/include/iosfwd b/libcxx/include/iosfwd index f1c2cbd966967..9af5e05031850 100644 --- a/libcxx/include/iosfwd +++ b/libcxx/include/iosfwd @@ -110,6 +110,7 @@ using wosyncstream = basic_osyncstream; // C++20 #include <__fwd/fstream.h> #include <__fwd/ios.h> #include <__fwd/istream.h> +#include <__fwd/memory.h> #include <__fwd/ostream.h> #include <__fwd/sstream.h> #include <__fwd/streambuf.h> @@ -162,10 +163,6 @@ using wosyncstream = basic_osyncstream; #endif // _LIBCPP_STD_VER >= 20 && !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_SYNCSTREAM) -// Include other forward declarations here -template <class _Tp, class _Allocator = allocator<_Tp> > -class _LIBCPP_TEMPLATE_VIS vector; - template <class _CharT, class _Traits = char_traits<_CharT> > class __save_flags { typedef basic_ios<_CharT, _Traits> __stream_type; diff --git a/libcxx/include/libcxx.imp b/libcxx/include/libcxx.imp index 56ea58262828a..fb446da5dc6fe 100644 --- a/libcxx/include/libcxx.imp +++ b/libcxx/include/libcxx.imp @@ -425,22 +425,27 @@ { include: [ "<__fwd/bit_reference.h>", "private", "<bitset>", "public" ] }, { include: [ "<__fwd/bit_reference.h>", "private", "<vector>", "public" ] }, { include: [ "<__fwd/complex.h>", "private", "<complex>", "public" ] }, + { include: [ "<__fwd/deque.h>", "private", "<deque>", "public" ] }, { include: [ "<__fwd/format.h>", "private", "<format>", "public" ] }, { include: [ "<__fwd/fstream.h>", "private", "<fstream>", "public" ] }, { include: [ "<__fwd/functional.h>", "private", "<functional>", "public" ] }, { include: [ "<__fwd/ios.h>", "private", "<ios>", "public" ] }, { include: [ "<__fwd/istream.h>", "private", "<istream>", "public" ] }, { include: [ "<__fwd/mdspan.h>", "private", "<mdspan>", "public" ] }, + { include: [ "<__fwd/memory.h>", "private", "<memory>", "public" ] }, { include: [ "<__fwd/memory_resource.h>", "private", "<memory_resource>", "public" ] }, { include: [ "<__fwd/ostream.h>", "private", "<ostream>", "public" ] }, { include: [ "<__fwd/pair.h>", "private", "<utility>", "public" ] }, + { include: [ "<__fwd/queue.h>", "private", "<queue>", "public" ] }, { include: [ "<__fwd/span.h>", "private", "<span>", "public" ] }, { include: [ "<__fwd/sstream.h>", "private", "<sstream>", "public" ] }, + { include: [ "<__fwd/stack.h>", "private", "<stack>", "public" ] }, { include: [ "<__fwd/streambuf.h>", "private", "<streambuf>", "public" ] }, { include: [ "<__fwd/string.h>", "private", "<string>", "public" ] }, { include: [ "<__fwd/string_view.h>", "private", "<string_view>", "public" ] }, { include: [ "<__fwd/subrange.h>", "private", "<ranges>", "public" ] }, { include: [ "<__fwd/tuple.h>", "private", "<tuple>",
"public" ] }, + { include: [ "<__fwd/vector.h>", "private", "", "public" ] }, { include: [ "<__ios/fpos.h>", "private", "", "public" ] }, { include: [ "<__iterator/access.h>", "private", "", "public" ] }, { include: [ "<__iterator/advance.h>", "private", "", "public" ] }, diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 03d18775631ed..079c6234d4105 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -1255,6 +1255,8 @@ module std_private_debug_utils_strict_weak_ordering_check [system] { export std_private_type_traits_is_constant_evaluated } +module std_private_deque_fwd [system] { header "__fwd/deque.h" } + module std_private_exception_exception [system] { header "__exception/exception.h" } module std_private_exception_exception_ptr [system] { header "__exception/exception_ptr.h" @@ -1535,6 +1537,7 @@ module std_private_memory_concepts [system] { } module std_private_memory_construct_at [system] { header "__memory/construct_at.h" } module std_private_memory_destruct_n [system] { header "__memory/destruct_n.h" } +module std_private_memory_fwd [system] { header "__fwd/memory.h" } module std_private_memory_pointer_traits [system] { header "__memory/pointer_traits.h" } module std_private_memory_ranges_construct_at [system] { header "__memory/ranges_construct_at.h" } module std_private_memory_ranges_uninitialized_algorithms [system] { @@ -1600,6 +1603,8 @@ module std_private_numeric_transform_exclusive_scan [system] { header "__numeric module std_private_numeric_transform_inclusive_scan [system] { header "__numeric/transform_inclusive_scan.h" } module std_private_numeric_transform_reduce [system] { header "__numeric/transform_reduce.h" } +module std_private_queue_fwd [system] { header "__fwd/queue.h" } + module std_private_random_bernoulli_distribution [system] { header "__random/bernoulli_distribution.h" } module std_private_random_binomial_distribution [system] { header "__random/binomial_distribution.h" } module std_private_random_cauchy_distribution [system] { header "__random/cauchy_distribution.h" } @@ -1737,6 +1742,8 @@ module std_private_ranges_zip_view [system] { header "__ranges module std_private_span_span_fwd [system] { header "__fwd/span.h" } +module std_private_stack_fwd [system] { header "__fwd/stack.h" } + module std_private_stop_token_atomic_unique_lock [system] { header "__stop_token/atomic_unique_lock.h" } module std_private_stop_token_intrusive_list_view [system] { header "__stop_token/intrusive_list_view.h" } module std_private_stop_token_intrusive_shared_ptr [system] { header "__stop_token/intrusive_shared_ptr.h" } @@ -2085,3 +2092,5 @@ module std_private_utility_to_underlying [system] { header "__utility/t module std_private_utility_unreachable [system] { header "__utility/unreachable.h" } module std_private_variant_monostate [system] { header "__variant/monostate.h" } + +module std_private_vector_fwd [system] { header "__fwd/vector.h" } diff --git a/libcxx/include/queue b/libcxx/include/queue index 521a465713cd2..f94cd7671863f 100644 --- a/libcxx/include/queue +++ b/libcxx/include/queue @@ -260,6 +260,8 @@ template #include <__algorithm/ranges_copy.h> #include <__config> #include <__functional/operations.h> +#include <__fwd/deque.h> +#include <__fwd/queue.h> #include <__iterator/back_insert_iterator.h> #include <__iterator/iterator_traits.h> #include <__memory/uses_allocator.h> @@ -287,9 +289,6 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD -template > -class _LIBCPP_TEMPLATE_VIS queue; - template 
_LIBCPP_HIDE_FROM_ABI bool operator==(const queue<_Tp, _Container>& __x, const queue<_Tp, _Container>& __y); @@ -511,7 +510,7 @@ template <class _Tp, class _Container, class _Alloc> struct _LIBCPP_TEMPLATE_VIS uses_allocator<queue<_Tp, _Container>, _Alloc> : public uses_allocator<_Container, _Alloc> { }; -template <class _Tp, class _Container = vector<_Tp>, class _Compare = less<typename _Container::value_type> > +template <class _Tp, class _Container, class _Compare> class _LIBCPP_TEMPLATE_VIS priority_queue { public: typedef _Container container_type; diff --git a/libcxx/include/stack b/libcxx/include/stack index 4003792600a00..08a392da6848d 100644 --- a/libcxx/include/stack +++ b/libcxx/include/stack @@ -115,6 +115,7 @@ template #include <__algorithm/ranges_copy.h> #include <__config> +#include <__fwd/stack.h> #include <__iterator/back_insert_iterator.h> #include <__iterator/iterator_traits.h> #include <__memory/uses_allocator.h> @@ -142,9 +143,6 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD -template <class _Tp, class _Container = deque<_Tp> > -class _LIBCPP_TEMPLATE_VIS stack; - template <class _Tp, class _Container> _LIBCPP_HIDE_FROM_ABI bool operator==(const stack<_Tp, _Container>& __x, const stack<_Tp, _Container>& __y); diff --git a/libcxx/include/vector b/libcxx/include/vector index 0908482600c53..1defc43a52478 100644 --- a/libcxx/include/vector +++ b/libcxx/include/vector @@ -325,6 +325,7 @@ template<class T> requires is-vector-bool-reference<T> // Since C++ #include <__format/formatter_bool.h> #include <__functional/hash.h> #include <__functional/unary_function.h> +#include <__fwd/vector.h> #include <__iterator/advance.h> #include <__iterator/distance.h> #include <__iterator/iterator_traits.h> @@ -357,7 +358,6 @@ template<class T> requires is-vector-bool-reference<T> // Since C++ #include <__utility/swap.h> #include #include -#include <iosfwd> // for forward declaration of vector #include #include #include @@ -2989,6 +2989,7 @@ _LIBCPP_POP_MACROS # include # include # include +# include <iosfwd> # include # include # include diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv index c65b9b9d705e2..2e246644f626c 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx03.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv @@ -267,6 +267,7 @@ filesystem type_traits filesystem version format array format cctype +format cerrno format clocale format cmath format cstddef diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv index b3d9e327fc7aa..e074bf1f7dcc8 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx11.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv @@ -268,6 +268,7 @@ filesystem type_traits filesystem version format array format cctype +format cerrno format clocale format cmath format cstddef diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv index d723409422a3e..88f9c24f08646 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx14.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv @@ -270,6 +270,7 @@ filesystem type_traits filesystem version format array format cctype +format cerrno format clocale format cmath format cstddef diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv index d723409422a3e..88f9c24f08646 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx17.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv @@ -270,6 +270,7 @@ filesystem type_traits filesystem version format array format cctype +format cerrno format clocale format cmath format cstddef diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv index
03b4eda8b4d86..27f59660fb98d 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx20.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv @@ -281,6 +281,7 @@ filesystem type_traits filesystem version format array format cctype +format cerrno format clocale format cmath format cstddef diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv index 062127364adfe..79c67dc00cfb9 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx23.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv @@ -190,6 +190,7 @@ filesystem string_view filesystem version format array format cctype +format cerrno format clocale format cmath format cstddef @@ -201,8 +202,6 @@ format initializer_list format limits format new format optional -format queue -format stack format stdexcept format string format string_view @@ -680,7 +679,6 @@ vector cstdlib vector cstring vector cwchar vector initializer_list -vector iosfwd vector limits vector new vector stdexcept diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index 062127364adfe..79c67dc00cfb9 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -190,6 +190,7 @@ filesystem string_view filesystem version format array format cctype +format cerrno format clocale format cmath format cstddef @@ -201,8 +202,6 @@ format initializer_list format limits format new format optional -format queue -format stack format stdexcept format string format string_view @@ -680,7 +679,6 @@ vector cstdlib vector cstring vector cwchar vector initializer_list -vector iosfwd vector limits vector new vector stdexcept diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp index e7f3994d977dc..55c9eea863c3f 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp @@ -184,5 +184,33 @@ int main(int, char**) { } } + { // check the tail of the vectorized loop + for (size_t vec_size = 1; vec_size != 256; ++vec_size) { + { + std::vector<char> lhs(256); + std::vector<char> rhs(256); + + check(lhs, rhs, lhs.size()); + lhs.back() = 1; + check(lhs, rhs, lhs.size() - 1); + lhs.back() = 0; + rhs.back() = 1; + check(lhs, rhs, lhs.size() - 1); + rhs.back() = 0; + } + { + std::vector<int> lhs(256); + std::vector<int> rhs(256); + + check(lhs, rhs, lhs.size()); + lhs.back() = 1; + check(lhs, rhs, lhs.size() - 1); + lhs.back() = 0; + rhs.back() = 1; + check(lhs, rhs, lhs.size() - 1); + rhs.back() = 0; + } + } + } return 0; } diff --git a/libcxx/test/std/containers/sequences/vector/vector.cons/deduct.verify.cpp b/libcxx/test/std/containers/sequences/vector/vector.cons/deduct.verify.cpp index 7ce00d70f8442..2b2242e240a2c 100644 --- a/libcxx/test/std/containers/sequences/vector/vector.cons/deduct.verify.cpp +++ b/libcxx/test/std/containers/sequences/vector/vector.cons/deduct.verify.cpp @@ -14,25 +14,20 @@ // -> vector<typename iterator_traits<InputIterator>::value_type, Allocator>; // -#include -#include #include #include - - int main(int, char**) -{ -// Test the explicit deduction guides - -// Test the implicit deduction guides - { -// vector (allocator &) - std::vector vec((std::allocator<int>())); // expected-error {{no viable constructor or deduction guide for deduction of template arguments of 'vector'}} -// Note: The extra parens are necessary, since otherwise clang decides it is a
function declaration. -// Also, we can't use {} instead of parens, because that constructs a -// deque<allocator<int>, allocator<allocator<int>>> - } - +#include + +int main(int, char**) { + // Test the explicit deduction guides + // TODO: Should there be tests for explicit deduction guides? + + // Test the implicit deduction guides + { + // vector (allocator &) + // expected-error@+1 {{no viable constructor or deduction guide for deduction of template arguments of 'vector'}} + std::vector vec(std::allocator< int>{}); + } return 0; } diff --git a/libcxx/test/support/deduction_guides_sfinae_checks.h b/libcxx/test/support/deduction_guides_sfinae_checks.h index 8b715da5a34e2..0c32b3732413a 100644 --- a/libcxx/test/support/deduction_guides_sfinae_checks.h +++ b/libcxx/test/support/deduction_guides_sfinae_checks.h @@ -16,6 +16,7 @@ #include #include #include +#include #include "test_macros.h" #if TEST_STD_VER >= 23 diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index 725c6f166fffc..6529ea072fae2 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -1557,7 +1557,7 @@ template <class ELFT> void SharedFile::parse() { Symbol *s = symtab.addSymbol( Undefined{this, name, sym.getBinding(), sym.st_other, sym.getType()}); s->exportDynamic = true; - if (s->isUndefined() && sym.getBinding() != STB_WEAK && + if (sym.getBinding() != STB_WEAK && config->unresolvedSymbolsInShlib != UnresolvedPolicy::Ignore) requiredSymbols.push_back(s); continue; diff --git a/lld/test/ELF/allow-shlib-undefined.s b/lld/test/ELF/allow-shlib-undefined.s index 4b7151c8bc0d5..c69c1ea20ce3b 100644 --- a/lld/test/ELF/allow-shlib-undefined.s +++ b/lld/test/ELF/allow-shlib-undefined.s @@ -31,10 +31,12 @@ ## Test some cases when a relocatable object file provides a non-exported definition. # RUN: not ld.lld main.o a.so def-hidden.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED +# RUN: not ld.lld main.o def-hidden.o a.so -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED # RUN: not ld.lld main.o a.so def-hidden.o -shared --no-allow-shlib-undefined -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED # RUN: ld.lld main.o a.so def-hidden.o --allow-shlib-undefined --fatal-warnings -o /dev/null ## Test a relocatable object file definition that is converted to STB_LOCAL. # RUN: not ld.lld main.o a.so def-hidden.o --version-script=local.ver -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED +# RUN: not ld.lld main.o def-hidden.o a.so --version-script=local.ver -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED ## The section containing the definition is discarded, and we report an error.
# RUN: not ld.lld --gc-sections main.o a.so def-hidden.o -o /dev/null 2>&1 | FileCheck %s diff --git a/lld/test/ELF/shlib-undefined-local.s b/lld/test/ELF/shlib-undefined-local.s index 8fceec1bf60ff..6d3e8da34e291 100644 --- a/lld/test/ELF/shlib-undefined-local.s +++ b/lld/test/ELF/shlib-undefined-local.s @@ -5,10 +5,9 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-linux-gnu -o %t2.o %s # RUN: echo "{ local: *; };" > %t.script -# RUN: ld.lld -version-script %t.script -o %t %t2.o %t.so -# RUN: llvm-nm -g %t | FileCheck -allow-empty %s +# RUN: not ld.lld -version-script %t.script %t2.o %t.so -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR -# CHECK-NOT: should_not_be_exported +# ERR: error: non-exported symbol 'should_not_be_exported' in '{{.*}}tmp2.o' is referenced by DSO '{{.*}}tmp.so' .globl should_not_be_exported should_not_be_exported: diff --git a/lldb/include/lldb/Utility/Scalar.h b/lldb/include/lldb/Utility/Scalar.h index 8e087a5ddeb85..d7155884c6d1b 100644 --- a/lldb/include/lldb/Utility/Scalar.h +++ b/lldb/include/lldb/Utility/Scalar.h @@ -71,6 +71,7 @@ class Scalar { : m_type(e_int), m_integer(std::move(v), false), m_float(0.0f) {} Scalar(llvm::APSInt v) : m_type(e_int), m_integer(std::move(v)), m_float(0.0f) {} + Scalar(llvm::APFloat v) : m_type(e_float), m_integer(0), m_float(v) {} bool SignExtend(uint32_t bit_pos); @@ -186,6 +187,10 @@ class Scalar { Status SetValueFromData(const DataExtractor &data, lldb::Encoding encoding, size_t byte_size); + llvm::APFloat CreateAPFloatFromAPSInt(lldb::BasicType basic_type); + + llvm::APFloat CreateAPFloatFromAPFloat(lldb::BasicType basic_type); + protected: Scalar::Type m_type = e_void; llvm::APSInt m_integer; diff --git a/lldb/source/Utility/Scalar.cpp b/lldb/source/Utility/Scalar.cpp index 5ad68065bce1b..e94fd45962366 100644 --- a/lldb/source/Utility/Scalar.cpp +++ b/lldb/source/Utility/Scalar.cpp @@ -813,6 +813,48 @@ bool Scalar::ExtractBitfield(uint32_t bit_size, uint32_t bit_offset) { return false; } +llvm::APFloat Scalar::CreateAPFloatFromAPSInt(lldb::BasicType basic_type) { + switch (basic_type) { + case lldb::eBasicTypeFloat: + return llvm::APFloat( + m_integer.isSigned() + ? llvm::APIntOps::RoundSignedAPIntToFloat(m_integer) + : llvm::APIntOps::RoundAPIntToFloat(m_integer)); + case lldb::eBasicTypeDouble: + // No way to get more precision at the moment. + case lldb::eBasicTypeLongDouble: + return llvm::APFloat( + m_integer.isSigned() + ? llvm::APIntOps::RoundSignedAPIntToDouble(m_integer) + : llvm::APIntOps::RoundAPIntToDouble(m_integer)); + default: + const llvm::fltSemantics &sem = APFloat::IEEEsingle(); + return llvm::APFloat::getNaN(sem); + } +} + +llvm::APFloat Scalar::CreateAPFloatFromAPFloat(lldb::BasicType basic_type) { + switch (basic_type) { + case lldb::eBasicTypeFloat: { + bool loses_info; + m_float.convert(llvm::APFloat::IEEEsingle(), + llvm::APFloat::rmNearestTiesToEven, &loses_info); + return m_float; + } + case lldb::eBasicTypeDouble: + // No way to get more precision at the moment. 
+ case lldb::eBasicTypeLongDouble: { + bool loses_info; + m_float.convert(llvm::APFloat::IEEEdouble(), + llvm::APFloat::rmNearestTiesToEven, &loses_info); + return m_float; + } + default: + const llvm::fltSemantics &sem = APFloat::IEEEsingle(); + return llvm::APFloat::getNaN(sem); + } +} + bool lldb_private::operator==(Scalar lhs, Scalar rhs) { // If either entry is void then we can just compare the types if (lhs.m_type == Scalar::e_void || rhs.m_type == Scalar::e_void) diff --git a/lldb/unittests/Utility/ScalarTest.cpp b/lldb/unittests/Utility/ScalarTest.cpp index 29a4bcd356f11..8d957d16593ee 100644 --- a/lldb/unittests/Utility/ScalarTest.cpp +++ b/lldb/unittests/Utility/ScalarTest.cpp @@ -402,3 +402,61 @@ TEST(ScalarTest, TruncOrExtendTo) { S.TruncOrExtendTo(16, false); EXPECT_EQ(S.UInt128(APInt()), APInt(16, 0xffffu)); } + +TEST(ScalarTest, APFloatConstructor) { + llvm::APFloat my_single(llvm::APFloatBase::IEEEsingle(), "3.14159"); + llvm::APFloat my_double(llvm::APFloatBase::IEEEdouble(), "3.14159"); + Scalar S(my_single); + Scalar D(my_double); + + EXPECT_EQ(S.GetType(), Scalar::e_float); + EXPECT_EQ(D.GetType(), Scalar::e_float); + ASSERT_TRUE(S != D); +} + +TEST(ScalarTest, CreateAPFloats) { + llvm::APFloat ap_float(llvm::APFloatBase::IEEEsingle(), "3.14159"); + llvm::APFloat ap_nan = llvm::APFloat::getNaN(llvm::APFloat::IEEEsingle()); + llvm::APSInt int1("12"); + llvm::APSInt int2("-4"); + Scalar I1(int1); + Scalar I2(int2); + Scalar F(ap_float); + + llvm::APFloat out1_float = I1.CreateAPFloatFromAPSInt(lldb::eBasicTypeFloat); + llvm::APFloat out1_double = + I1.CreateAPFloatFromAPSInt(lldb::eBasicTypeDouble); + llvm::APFloat out1_longdouble = + I1.CreateAPFloatFromAPSInt(lldb::eBasicTypeLongDouble); + llvm::APFloat out1_nan = + I1.CreateAPFloatFromAPSInt(lldb::eBasicTypeFloatComplex); + EXPECT_TRUE(!out1_float.isNegative()); + EXPECT_TRUE(!out1_double.isNegative()); + EXPECT_TRUE(out1_double.bitwiseIsEqual(out1_longdouble)); + EXPECT_FALSE(out1_double.bitwiseIsEqual(out1_float)); + EXPECT_TRUE(out1_nan.bitwiseIsEqual(ap_nan)); + + llvm::APFloat out2_float = I2.CreateAPFloatFromAPSInt(lldb::eBasicTypeFloat); + llvm::APFloat out2_double = + I2.CreateAPFloatFromAPSInt(lldb::eBasicTypeDouble); + llvm::APFloat out2_longdouble = + I2.CreateAPFloatFromAPSInt(lldb::eBasicTypeLongDouble); + llvm::APFloat out2_nan = + I2.CreateAPFloatFromAPSInt(lldb::eBasicTypeFloatComplex); + EXPECT_TRUE(out2_float.isNegative()); + EXPECT_TRUE(out2_double.isNegative()); + EXPECT_TRUE(out2_double.bitwiseIsEqual(out2_longdouble)); + EXPECT_FALSE(out2_double.bitwiseIsEqual(out2_float)); + EXPECT_TRUE(out2_nan.bitwiseIsEqual(ap_nan)); + + llvm::APFloat out3_float = F.CreateAPFloatFromAPFloat(lldb::eBasicTypeFloat); + llvm::APFloat out3_double = + F.CreateAPFloatFromAPFloat(lldb::eBasicTypeDouble); + llvm::APFloat out3_longdouble = + F.CreateAPFloatFromAPFloat(lldb::eBasicTypeLongDouble); + llvm::APFloat out3_nan = + F.CreateAPFloatFromAPFloat(lldb::eBasicTypeFloatComplex); + EXPECT_TRUE(out3_double.bitwiseIsEqual(out3_longdouble)); + EXPECT_FALSE(out3_double.bitwiseIsEqual(out3_float)); + EXPECT_TRUE(out3_nan.bitwiseIsEqual(ap_nan)); +} diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 23694bf85d8c1..5d5d47f683227 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -11423,6 +11423,9 @@ Syntax: :: <result> = trunc <ty> <value> to <ty2> ; yields ty2 + <result> = trunc nsw <ty> <value> to <ty2> ; yields ty2 + <result> = trunc nuw <ty> <value> to <ty2> ; yields ty2 + <result> = trunc nuw nsw <ty> <value> to <ty2> ; yields ty2 Overview: """"""""" @@ -11446,6 +11449,11 @@ and converts the remaining
bits to ``ty2``. Since the source size must be larger than the destination size, ``trunc`` cannot be a *no-op cast*. It will always truncate bits. +If the ``nuw`` keyword is present, and any of the truncated bits are non-zero, +the result is a :ref:`poison value <poisonvalues>`. If the ``nsw`` keyword +is present, and any of the truncated bits are not the same as the top bit +of the truncation result, the result is a :ref:`poison value <poisonvalues>`. + Example: """""""" diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index a790ced46f955..c6adbc436bb48 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -495,6 +495,13 @@ enum OverflowingBinaryOperatorOptionalFlags { OBO_NO_SIGNED_WRAP = 1 }; +/// TruncInstOptionalFlags - Flags for serializing +/// TruncInst's SubclassOptionalData contents. +enum TruncInstOptionalFlags { + TIO_NO_UNSIGNED_WRAP = 0, + TIO_NO_SIGNED_WRAP = 1 +}; + /// FastMath Flags /// This is a fixed layout derived from the bitcode emitted by LLVM 5.0 /// intended to decouple the in-memory representation from the serialization. diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 9e8fc5d635c50..28d9cf6260d62 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -673,6 +673,14 @@ class CombinerHelper { bool matchSDivByConst(MachineInstr &MI); void applySDivByConst(MachineInstr &MI); + /// Given a G_SDIV \p MI expressing a signed division by a pow2 constant, + /// return expressions that implement it by shifting. + bool matchDivByPow2(MachineInstr &MI, bool IsSigned); + void applySDivByPow2(MachineInstr &MI); + /// Given a G_UDIV \p MI expressing an unsigned division by a pow2 constant, + /// return expressions that implement it by shifting. + void applyUDivByPow2(MachineInstr &MI); + // G_UMULH x, (1 << c)) -> x >> (bitwidth - c) bool matchUMulHToLShr(MachineInstr &MI); void applyUMulHToLShr(MachineInstr &MI); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h index 0241ec4f2111d..807cec3c177d9 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -315,6 +315,10 @@ std::optional<SmallVector<unsigned>> ConstantFoldCountZeros(Register Src, const MachineRegisterInfo &MRI, std::function<unsigned(APInt)> CB); +std::optional<SmallVector<APInt>> +ConstantFoldICmp(unsigned Pred, const Register Op1, const Register Op2, + const MachineRegisterInfo &MRI); + /// Test if the given value is known to have exactly one bit set. This differs /// from computeKnownBits in that it doesn't necessarily determine which bit is set.
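The nuw/nsw trunc rules above are easy to sanity-check with a standalone model. The following C++ sketch is not part of the patch (the helper name and the fixed i32-to-i8 widths are illustrative assumptions); it mirrors the LangRef wording, i.e. the same two conditions that the TIO_NO_UNSIGNED_WRAP/TIO_NO_SIGNED_WRAP bits carry through bitcode:

#include <cassert>
#include <cstdint>

// Models `trunc i32 %v to i8` under the new flags; "Poison" stands in for
// LLVM's poison value. Hypothetical helper, for illustration only.
struct TruncResult {
  uint8_t Value;
  bool Poison;
};

TruncResult truncI32ToI8(uint32_t V, bool NUW, bool NSW) {
  TruncResult R{static_cast<uint8_t>(V), false}; // keep the low 8 bits
  if (NUW && (V >> 8) != 0) // nuw: any truncated bit is non-zero -> poison
    R.Poison = true;
  // nsw: poison unless all truncated bits equal the top bit of the result,
  // i.e. the value survives a sign-extending round trip.
  uint32_t SExt =
      static_cast<uint32_t>(static_cast<int32_t>(static_cast<int8_t>(R.Value)));
  if (NSW && SExt != V)
    R.Poison = true;
  return R;
}

int main() {
  assert(!truncI32ToI8(0x7F, true, true).Poison);         // fits both ways
  assert(truncI32ToI8(0x80, false, true).Poison);         // 128 becomes -128
  assert(!truncI32ToI8(0x80, true, false).Poison);        // high bits are zero
  assert(!truncI32ToI8(0xFFFFFFFFu, false, true).Poison); // -1 stays -1
  assert(truncI32ToI8(0xFFFFFFFFu, true, false).Poison);  // truncated bits set
}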
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 4e4cf71a349d7..4ffa6349871ba 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -5345,6 +5345,8 @@ class TruncInst : public CastInst { TruncInst *cloneImpl() const; public: + enum { AnyWrap = 0, NoUnsignedWrap = (1 << 0), NoSignedWrap = (1 << 1) }; + /// Constructor with insert-before-instruction semantics TruncInst( Value *S, ///< The value to be truncated @@ -5376,6 +5378,39 @@ class TruncInst : public CastInst { static bool classof(const Value *V) { return isa<Instruction>(V) && classof(cast<Instruction>(V)); } + + void setHasNoUnsignedWrap(bool B) { + SubclassOptionalData = + (SubclassOptionalData & ~NoUnsignedWrap) | (B * NoUnsignedWrap); + } + void setHasNoSignedWrap(bool B) { + SubclassOptionalData = + (SubclassOptionalData & ~NoSignedWrap) | (B * NoSignedWrap); + } + + /// Test whether this operation is known to never + /// undergo unsigned overflow, aka the nuw property. + bool hasNoUnsignedWrap() const { + return SubclassOptionalData & NoUnsignedWrap; + } + + /// Test whether this operation is known to never + /// undergo signed overflow, aka the nsw property. + bool hasNoSignedWrap() const { + return (SubclassOptionalData & NoSignedWrap) != 0; + } + + /// Returns the no-wrap kind of the operation. + unsigned getNoWrapKind() const { + unsigned NoWrapKind = 0; + if (hasNoUnsignedWrap()) + NoWrapKind |= NoUnsignedWrap; + + if (hasNoSignedWrap()) + NoWrapKind |= NoSignedWrap; + + return NoWrapKind; + } }; //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 6980cbd04aeb1..72d3c0ea69bcd 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -179,6 +179,7 @@ def FmArcp : MIFlagEnum<"FmArcp">; def FmContract : MIFlagEnum<"FmContract">; def FmAfn : MIFlagEnum<"FmAfn">; def FmReassoc : MIFlagEnum<"FmReassoc">; +def IsExact : MIFlagEnum<"IsExact">; def MIFlags; // def not; -> Already defined as a SDNode @@ -1036,7 +1037,20 @@ def sdiv_by_const : GICombineRule< [{ return Helper.matchSDivByConst(*${root}); }]), (apply [{ Helper.applySDivByConst(*${root}); }])>; -def intdiv_combines : GICombineGroup<[udiv_by_const, sdiv_by_const]>; +def sdiv_by_pow2 : GICombineRule< + (defs root:$root), + (match (G_SDIV $dst, $x, $y, (MIFlags (not IsExact))):$root, + [{ return Helper.matchDivByPow2(*${root}, /*IsSigned=*/true); }]), + (apply [{ Helper.applySDivByPow2(*${root}); }])>; + +def udiv_by_pow2 : GICombineRule< + (defs root:$root), + (match (G_UDIV $dst, $x, $y, (MIFlags (not IsExact))):$root, + [{ return Helper.matchDivByPow2(*${root}, /*IsSigned=*/false); }]), + (apply [{ Helper.applyUDivByPow2(*${root}); }])>; + +def intdiv_combines : GICombineGroup<[udiv_by_const, sdiv_by_const, + sdiv_by_pow2, udiv_by_pow2]>; def reassoc_ptradd : GICombineRule< (defs root:$root, build_fn_matchinfo:$matchinfo), diff --git a/llvm/include/llvm/TextAPI/DylibReader.h b/llvm/include/llvm/TextAPI/DylibReader.h index b556fbf6832a9..6861d3cb1591b 100644 --- a/llvm/include/llvm/TextAPI/DylibReader.h +++ b/llvm/include/llvm/TextAPI/DylibReader.h @@ -13,6 +13,7 @@ #ifndef LLVM_TEXTAPI_DYLIBREADER_H #define LLVM_TEXTAPI_DYLIBREADER_H +#include "llvm/ADT/StringMap.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/TextAPI/ArchitectureSet.h" @@ -43,6 +44,14 @@ Expected<Records>
readFile(MemoryBufferRef Buffer, const ParseOption &Opt); /// \param Buffer Data that points to dylib. Expected<std::unique_ptr<RecordsSlice>> get(MemoryBufferRef Buffer); +using SymbolToSourceLocMap = llvm::StringMap<RecordLoc>; +/// Get the source location for each symbol from dylib. +/// +/// \param DSYM Path to DSYM file. +/// \param T Requested target slice for dylib. +SymbolToSourceLocMap accumulateSourceLocFromDSYM(const StringRef DSYM, + const Target &T); + } // namespace llvm::MachO::DylibReader #endif // LLVM_TEXTAPI_DYLIBREADER_H diff --git a/llvm/include/llvm/TextAPI/Record.h b/llvm/include/llvm/TextAPI/Record.h index ef152ce433877..7d721988ec3da 100644 --- a/llvm/include/llvm/TextAPI/Record.h +++ b/llvm/include/llvm/TextAPI/Record.h @@ -27,6 +27,23 @@ LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); class RecordsSlice; +// Defines lightweight source location for records. +struct RecordLoc { + RecordLoc() = default; + RecordLoc(std::string File, unsigned Line) + : File(std::move(File)), Line(Line) {} + + /// Whether there is a source location tied to the RecordLoc object. + bool isValid() const { return !File.empty(); } + + bool operator==(const RecordLoc &O) const { + return std::tie(File, Line) == std::tie(O.File, O.Line); + } + + const std::string File; + const unsigned Line = 0; +}; + // Defines a list of linkage types. enum class RecordLinkage : uint8_t { // Unknown linkage. diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h new file mode 100644 index 0000000000000..7ae6194da7c9c --- /dev/null +++ b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h @@ -0,0 +1,154 @@ +//===- Transforms/IPO/SampleProfileMatcher.h ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file provides the interface for SampleProfileMatcher. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_IPO_SAMPLEPROFILEMATCHER_H +#define LLVM_TRANSFORMS_IPO_SAMPLEPROFILEMATCHER_H + +#include "llvm/ADT/StringSet.h" +#include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h" + +namespace llvm { + +// Sample profile matching - fuzzy match. +class SampleProfileMatcher { + Module &M; + SampleProfileReader &Reader; + const PseudoProbeManager *ProbeManager; + const ThinOrFullLTOPhase LTOPhase; + SampleProfileMap FlattenedProfiles; + // For each function, the matcher generates a map, of which each entry is a + // mapping from the source location of the current build to the source location + // in the profile. + StringMap<LocToLocMap> FuncMappings; + + // Match state for an anchor/callsite. + enum class MatchState { + Unknown = 0, + // Initial match between input profile and current IR. + InitialMatch = 1, + // Initial mismatch between input profile and current IR. + InitialMismatch = 2, + // InitialMatch stays matched after fuzzy profile matching. + UnchangedMatch = 3, + // InitialMismatch stays mismatched after fuzzy profile matching. + UnchangedMismatch = 4, + // InitialMismatch is recovered after fuzzy profile matching. + RecoveredMismatch = 5, + // InitialMatch is removed and becomes mismatched after fuzzy profile + // matching.
+ RemovedMatch = 6, + }; + + // For each function, store every callsite and its matching state into this + // map, of which each entry is a pair of callsite location and MatchState. + // This is used for profile staleness computation and report. + StringMap<std::unordered_map<LineLocation, MatchState, LineLocationHash>> + FuncCallsiteMatchStates; + + // Profile mismatch statistics: + uint64_t TotalProfiledFunc = 0; + // Number of checksum-mismatched functions. + uint64_t NumStaleProfileFunc = 0; + uint64_t TotalProfiledCallsites = 0; + uint64_t NumMismatchedCallsites = 0; + uint64_t NumRecoveredCallsites = 0; + // Total samples for all profiled functions. + uint64_t TotalFunctionSamples = 0; + // Total samples for all checksum-mismatched functions. + uint64_t MismatchedFunctionSamples = 0; + uint64_t MismatchedCallsiteSamples = 0; + uint64_t RecoveredCallsiteSamples = 0; + + // A dummy name for unknown indirect callee, used to differentiate from a + // non-call instruction that also has an empty callee name. + static constexpr const char *UnknownIndirectCallee = + "unknown.indirect.callee"; + +public: + SampleProfileMatcher(Module &M, SampleProfileReader &Reader, + const PseudoProbeManager *ProbeManager, + ThinOrFullLTOPhase LTOPhase) + : M(M), Reader(Reader), ProbeManager(ProbeManager), LTOPhase(LTOPhase){}; + void runOnModule(); + void clearMatchingData() { + // Do not clear FuncMappings, as it stores IRLoc to ProfLoc remappings which + // will be used for the sample loader. + FuncCallsiteMatchStates.clear(); + } + +private: + FunctionSamples *getFlattenedSamplesFor(const Function &F) { + StringRef CanonFName = FunctionSamples::getCanonicalFnName(F); + auto It = FlattenedProfiles.find(FunctionId(CanonFName)); + if (It != FlattenedProfiles.end()) + return &It->second; + return nullptr; + } + void runOnFunction(Function &F); + void findIRAnchors(const Function &F, + std::map<LineLocation, StringRef> &IRAnchors); + void findProfileAnchors( + const FunctionSamples &FS, + std::map<LineLocation, std::unordered_set<FunctionId>> &ProfileAnchors); + // Record the callsite match states for profile staleness report, the result + // is saved in FuncCallsiteMatchStates. + void recordCallsiteMatchStates( + const Function &F, const std::map<LineLocation, StringRef> &IRAnchors, + const std::map<LineLocation, std::unordered_set<FunctionId>> + &ProfileAnchors, + const LocToLocMap *IRToProfileLocationMap); + + bool isMismatchState(const enum MatchState &State) { + return State == MatchState::InitialMismatch || + State == MatchState::UnchangedMismatch || + State == MatchState::RemovedMatch; + }; + + bool isInitialState(const enum MatchState &State) { + return State == MatchState::InitialMatch || + State == MatchState::InitialMismatch; + }; + + bool isFinalState(const enum MatchState &State) { + return State == MatchState::UnchangedMatch || + State == MatchState::UnchangedMismatch || + State == MatchState::RecoveredMismatch || + State == MatchState::RemovedMatch; + }; + + // Count the samples of checksum mismatched function for the top-level + // function and all inlinees. + void countMismatchedFuncSamples(const FunctionSamples &FS, bool IsTopLevel); + // Count the number of mismatched or recovered callsites. + void countMismatchCallsites(const FunctionSamples &FS); + // Count the samples of mismatched or recovered callsites for top-level + // function and all inlinees.
+ void countMismatchedCallsiteSamples(const FunctionSamples &FS); + void computeAndReportProfileStaleness(); + + LocToLocMap &getIRToProfileLocationMap(const Function &F) { + auto Ret = FuncMappings.try_emplace( + FunctionSamples::getCanonicalFnName(F.getName()), LocToLocMap()); + return Ret.first->second; + } + void distributeIRToProfileLocationMap(); + void distributeIRToProfileLocationMap(FunctionSamples &FS); + void runStaleProfileMatching( + const Function &F, const std::map<LineLocation, StringRef> &IRAnchors, + const std::map<LineLocation, std::unordered_set<FunctionId>> + &ProfileAnchors, + LocToLocMap &IRToProfileLocationMap); + void reportOrPersistProfileStats(); +}; +} // end namespace llvm +#endif // LLVM_TRANSFORMS_IPO_SAMPLEPROFILEMATCHER_H diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h index 048b97c34ee2a..d898ee58307ea 100644 --- a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h +++ b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h @@ -146,6 +146,10 @@ class PseudoProbeManager { extern cl::opt<bool> SampleProfileUseProfi; +static inline bool skipProfileForFunction(const Function &F) { + return F.isDeclaration() || !F.hasFnAttribute("use-sample-profile"); +} + template <typename FT> class SampleProfileLoaderBaseImpl { public: SampleProfileLoaderBaseImpl(std::string Name, std::string RemapName, diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index e7fcd51036968..e78147a312bff 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -7018,7 +7018,19 @@ int LLParser::parseInstruction(Instruction *&Inst, BasicBlock *BB, Inst->setNonNeg(); return 0; } - case lltok::kw_trunc: + case lltok::kw_trunc: { + bool NUW = EatIfPresent(lltok::kw_nuw); + bool NSW = EatIfPresent(lltok::kw_nsw); + if (!NUW) + NUW = EatIfPresent(lltok::kw_nuw); + if (parseCast(Inst, PFS, KeywordVal)) + return true; + if (NUW) + cast<TruncInst>(Inst)->setHasNoUnsignedWrap(true); + if (NSW) + cast<TruncInst>(Inst)->setHasNoSignedWrap(true); + return false; + } case lltok::kw_sext: case lltok::kw_fptrunc: case lltok::kw_fpext: diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index d791c648e5211..598daf76d7113 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -5024,9 +5024,19 @@ Error BitcodeReader::parseFunctionBody(Function *F) { return error("Invalid cast"); I = CastInst::Create(CastOp, Op, ResTy); } - if (OpNum < Record.size() && isa<PossiblyNonNegInst>(I) && - (Record[OpNum] & (1 << bitc::PNNI_NON_NEG))) - I->setNonNeg(true); + + if (OpNum < Record.size()) { + if (Opc == Instruction::ZExt) { + if (Record[OpNum] & (1 << bitc::PNNI_NON_NEG)) + cast<PossiblyNonNegInst>(I)->setNonNeg(true); + } else if (Opc == Instruction::Trunc) { + if (Record[OpNum] & (1 << bitc::TIO_NO_UNSIGNED_WRAP)) + cast<TruncInst>(I)->setHasNoUnsignedWrap(true); + if (Record[OpNum] & (1 << bitc::TIO_NO_SIGNED_WRAP)) + cast<TruncInst>(I)->setHasNoSignedWrap(true); + } + } + InstructionList.push_back(I); break; } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index a0e1c74452273..fed3f4bd535a2 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -1648,6 +1648,11 @@ static uint64_t getOptimizationFlags(const Value *V) { } else if (const auto *NNI = dyn_cast<PossiblyNonNegInst>(V)) { if (NNI->hasNonNeg()) Flags |= 1 << bitc::PNNI_NON_NEG; + } else if (const auto *TI = dyn_cast<TruncInst>(V)) { + if (TI->hasNoSignedWrap()) + Flags |= 1 <<
bitc::TIO_NO_SIGNED_WRAP; + if (TI->hasNoUnsignedWrap()) + Flags |= 1 << bitc::TIO_NO_UNSIGNED_WRAP; } return Flags; diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 894285a7eb256..d5db79df68622 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -135,10 +135,13 @@ class AtomicExpandLegacy : public FunctionPass { // IRBuilder to be used for replacement atomic instructions. struct ReplacementIRBuilder : IRBuilder<InstSimplifyFolder> { // Preserves the DebugLoc from I, and preserves still valid metadata. + // Enable StrictFP builder mode when appropriate. explicit ReplacementIRBuilder(Instruction *I, const DataLayout &DL) : IRBuilder(I->getContext(), DL) { SetInsertPoint(I); this->CollectMetadataToCopy(I, {LLVMContext::MD_pcsections}); + if (BB->getParent()->getAttributes().hasFnAttr(Attribute::StrictFP)) + this->setIsFPConstrained(true); } }; diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp index a0bc325c6cda7..551ba1e6036c1 100644 --- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp @@ -174,6 +174,20 @@ MachineInstrBuilder CSEMIRBuilder::buildInstr(unsigned Opc, switch (Opc) { default: break; + case TargetOpcode::G_ICMP: { + assert(SrcOps.size() == 3 && "Invalid sources"); + assert(DstOps.size() == 1 && "Invalid dsts"); + LLT SrcTy = SrcOps[1].getLLTTy(*getMRI()); + + if (std::optional<SmallVector<APInt>> Cst = + ConstantFoldICmp(SrcOps[0].getPredicate(), SrcOps[1].getReg(), + SrcOps[2].getReg(), *getMRI())) { + if (SrcTy.isVector()) + return buildBuildVectorConstant(DstOps[0], *Cst); + return buildConstant(DstOps[0], Cst->front()); + } + break; + } case TargetOpcode::G_ADD: case TargetOpcode::G_PTR_ADD: case TargetOpcode::G_AND: diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 2a521b6b068af..98e7c73a801f5 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -872,7 +872,6 @@ bool CombinerHelper::matchSextTruncSextLoad(MachineInstr &MI) { void CombinerHelper::applySextTruncSextLoad(MachineInstr &MI) { assert(MI.getOpcode() == TargetOpcode::G_SEXT_INREG); - Builder.setInstrAndDebugLoc(MI); Builder.buildCopy(MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); MI.eraseFromParent(); } @@ -1299,7 +1298,6 @@ bool CombinerHelper::matchCombineIndexedLoadStore( void CombinerHelper::applyCombineIndexedLoadStore( MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) { MachineInstr &AddrDef = *MRI.getUniqueVRegDef(MatchInfo.Addr); - Builder.setInstrAndDebugLoc(MI); unsigned Opcode = MI.getOpcode(); bool IsStore = Opcode == TargetOpcode::G_STORE; unsigned NewOpcode = getIndexedOpc(Opcode); @@ -1416,14 +1414,8 @@ void CombinerHelper::applyCombineDivRem(MachineInstr &MI, // deps by "moving" the instruction incorrectly. Also keep track of which // instruction is first so we pick its operands, avoiding use-before-def // bugs. - MachineInstr *FirstInst; - if (dominates(MI, *OtherMI)) { - Builder.setInstrAndDebugLoc(MI); - FirstInst = &MI; - } else { - Builder.setInstrAndDebugLoc(*OtherMI); - FirstInst = OtherMI; - } + MachineInstr *FirstInst = dominates(MI, *OtherMI) ? &MI : OtherMI; + Builder.setInstrAndDebugLoc(*FirstInst); Builder.buildInstr(IsSigned ?
TargetOpcode::G_SDIVREM : TargetOpcode::G_UDIVREM, @@ -1556,7 +1548,6 @@ static APFloat constantFoldFpUnary(const MachineInstr &MI, void CombinerHelper::applyCombineConstantFoldFpUnary(MachineInstr &MI, const ConstantFP *Cst) { - Builder.setInstrAndDebugLoc(MI); APFloat Folded = constantFoldFpUnary(MI, MRI, Cst->getValue()); const ConstantFP *NewCst = ConstantFP::get(Builder.getContext(), Folded); Builder.buildFConstant(MI.getOperand(0), *NewCst); @@ -1691,7 +1682,6 @@ void CombinerHelper::applyShiftImmedChain(MachineInstr &MI, Opcode == TargetOpcode::G_USHLSAT) && "Expected G_SHL, G_ASHR, G_LSHR, G_SSHLSAT or G_USHLSAT"); - Builder.setInstrAndDebugLoc(MI); LLT Ty = MRI.getType(MI.getOperand(1).getReg()); unsigned const ScalarSizeInBits = Ty.getScalarSizeInBits(); auto Imm = MatchInfo.Imm; @@ -1807,7 +1797,6 @@ void CombinerHelper::applyShiftOfShiftedLogic(MachineInstr &MI, LLT ShlType = MRI.getType(MI.getOperand(2).getReg()); LLT DestType = MRI.getType(MI.getOperand(0).getReg()); - Builder.setInstrAndDebugLoc(MI); Register Const = Builder.buildConstant(ShlType, MatchInfo.ValSum).getReg(0); @@ -1943,7 +1932,6 @@ void CombinerHelper::applyCombineShlOfExtend(MachineInstr &MI, int64_t ShiftAmtVal = MatchData.Imm; LLT ExtSrcTy = MRI.getType(ExtSrcReg); - Builder.setInstrAndDebugLoc(MI); auto ShiftAmt = Builder.buildConstant(ExtSrcTy, ShiftAmtVal); auto NarrowShift = Builder.buildShl(ExtSrcTy, ExtSrcReg, ShiftAmt, MI.getFlags()); @@ -2013,7 +2001,6 @@ void CombinerHelper::applyCombineUnmergeMergeToPlainValues( LLT SrcTy = MRI.getType(Operands[0]); LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); bool CanReuseInputDirectly = DstTy == SrcTy; - Builder.setInstrAndDebugLoc(MI); for (unsigned Idx = 0; Idx < NumElems; ++Idx) { Register DstReg = MI.getOperand(Idx).getReg(); Register SrcReg = Operands[Idx]; @@ -2066,7 +2053,6 @@ void CombinerHelper::applyCombineUnmergeConstant(MachineInstr &MI, assert((MI.getNumOperands() - 1 == Csts.size()) && "Not enough operands to replace all defs"); unsigned NumElems = MI.getNumOperands() - 1; - Builder.setInstrAndDebugLoc(MI); for (unsigned Idx = 0; Idx < NumElems; ++Idx) { Register DstReg = MI.getOperand(Idx).getReg(); Builder.buildConstant(DstReg, Csts[Idx]); @@ -2104,7 +2090,6 @@ bool CombinerHelper::matchCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI) { } void CombinerHelper::applyCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI) { - Builder.setInstrAndDebugLoc(MI); Register SrcReg = MI.getOperand(MI.getNumDefs()).getReg(); Register Dst0Reg = MI.getOperand(0).getReg(); Builder.buildTrunc(Dst0Reg, SrcReg); @@ -2152,8 +2137,6 @@ void CombinerHelper::applyCombineUnmergeZExtToZExt(MachineInstr &MI) { LLT Dst0Ty = MRI.getType(Dst0Reg); LLT ZExtSrcTy = MRI.getType(ZExtSrcReg); - Builder.setInstrAndDebugLoc(MI); - if (Dst0Ty.getSizeInBits() > ZExtSrcTy.getSizeInBits()) { Builder.buildZExt(Dst0Reg, ZExtSrcReg); } else { @@ -2207,7 +2190,6 @@ void CombinerHelper::applyCombineShiftToUnmerge(MachineInstr &MI, LLT HalfTy = LLT::scalar(HalfSize); - Builder.setInstr(MI); auto Unmerge = Builder.buildUnmerge(HalfTy, SrcReg); unsigned NarrowShiftAmt = ShiftVal - HalfSize; @@ -2292,7 +2274,6 @@ bool CombinerHelper::matchCombineI2PToP2I(MachineInstr &MI, Register &Reg) { void CombinerHelper::applyCombineI2PToP2I(MachineInstr &MI, Register &Reg) { assert(MI.getOpcode() == TargetOpcode::G_INTTOPTR && "Expected a G_INTTOPTR"); Register DstReg = MI.getOperand(0).getReg(); - Builder.setInstr(MI); Builder.buildCopy(DstReg, Reg); MI.eraseFromParent(); } @@ -2300,7 +2281,6 
@@ void CombinerHelper::applyCombineI2PToP2I(MachineInstr &MI, Register &Reg) { void CombinerHelper::applyCombineP2IToI2P(MachineInstr &MI, Register &Reg) { assert(MI.getOpcode() == TargetOpcode::G_PTRTOINT && "Expected a G_PTRTOINT"); Register DstReg = MI.getOperand(0).getReg(); - Builder.setInstr(MI); Builder.buildZExtOrTrunc(DstReg, Reg); MI.eraseFromParent(); } @@ -2343,7 +2323,6 @@ void CombinerHelper::applyCombineAddP2IToPtrAdd( LLT PtrTy = MRI.getType(LHS); - Builder.setInstrAndDebugLoc(MI); auto PtrAdd = Builder.buildPtrAdd(PtrTy, LHS, RHS); Builder.buildPtrToInt(Dst, PtrAdd); MI.eraseFromParent(); @@ -2375,7 +2354,6 @@ void CombinerHelper::applyCombineConstPtrAddToI2P(MachineInstr &MI, auto &PtrAdd = cast<GPtrAdd>(MI); Register Dst = PtrAdd.getReg(0); - Builder.setInstrAndDebugLoc(MI); Builder.buildConstant(Dst, NewCst); PtrAdd.eraseFromParent(); } @@ -2455,7 +2433,6 @@ void CombinerHelper::applyCombineExtOfExt( (MI.getOpcode() == TargetOpcode::G_SEXT && SrcExtOp == TargetOpcode::G_ZEXT)) { Register DstReg = MI.getOperand(0).getReg(); - Builder.setInstrAndDebugLoc(MI); Builder.buildInstr(SrcExtOp, {DstReg}, {Reg}); MI.eraseFromParent(); } @@ -2488,7 +2465,6 @@ void CombinerHelper::applyCombineTruncOfExt( replaceRegWith(MRI, DstReg, SrcReg); return; } - Builder.setInstrAndDebugLoc(MI); if (SrcTy.getSizeInBits() < DstTy.getSizeInBits()) Builder.buildInstr(SrcExtOp, {DstReg}, {SrcReg}); else @@ -2576,8 +2552,6 @@ bool CombinerHelper::matchCombineTruncOfShift( void CombinerHelper::applyCombineTruncOfShift( MachineInstr &MI, std::pair<MachineInstr *, LLT> &MatchInfo) { - Builder.setInstrAndDebugLoc(MI); - MachineInstr *ShiftMI = MatchInfo.first; LLT NewShiftTy = MatchInfo.second; @@ -2823,7 +2797,6 @@ void CombinerHelper::applyFunnelShiftConstantModulo(MachineInstr &MI) { APInt NewConst = VRegAndVal->Value.urem( APInt(ConstTy.getSizeInBits(), DstTy.getScalarSizeInBits())); - Builder.setInstrAndDebugLoc(MI); auto NewConstInstr = Builder.buildConstant(ConstTy, NewConst.getZExtValue()); Builder.buildInstr( MI.getOpcode(), {MI.getOperand(0)}, @@ -2866,35 +2839,31 @@ bool CombinerHelper::matchOperandIsKnownToBeAPowerOfTwo(MachineInstr &MI, void CombinerHelper::replaceInstWithFConstant(MachineInstr &MI, double C) { assert(MI.getNumDefs() == 1 && "Expected only one def?"); - Builder.setInstr(MI); Builder.buildFConstant(MI.getOperand(0), C); MI.eraseFromParent(); } void CombinerHelper::replaceInstWithConstant(MachineInstr &MI, int64_t C) { assert(MI.getNumDefs() == 1 && "Expected only one def?"); - Builder.setInstr(MI); Builder.buildConstant(MI.getOperand(0), C); MI.eraseFromParent(); } void CombinerHelper::replaceInstWithConstant(MachineInstr &MI, APInt C) { assert(MI.getNumDefs() == 1 && "Expected only one def?"); - Builder.setInstr(MI); Builder.buildConstant(MI.getOperand(0), C); MI.eraseFromParent(); } -void CombinerHelper::replaceInstWithFConstant(MachineInstr &MI, ConstantFP *CFP) { +void CombinerHelper::replaceInstWithFConstant(MachineInstr &MI, + ConstantFP *CFP) { assert(MI.getNumDefs() == 1 && "Expected only one def?"); - Builder.setInstr(MI); Builder.buildFConstant(MI.getOperand(0), CFP->getValueAPF()); MI.eraseFromParent(); } void CombinerHelper::replaceInstWithUndef(MachineInstr &MI) { assert(MI.getNumDefs() == 1 && "Expected only one def?"); - Builder.setInstr(MI); Builder.buildUndef(MI.getOperand(0)); MI.eraseFromParent(); } @@ -2962,7 +2931,6 @@ bool CombinerHelper::matchCombineInsertVecElts( void CombinerHelper::applyCombineInsertVecElts( MachineInstr &MI, SmallVectorImpl<Register> &MatchInfo) { -
Builder.setInstr(MI); Register UndefReg; auto GetUndef = [&]() { if (UndefReg) @@ -2981,7 +2949,6 @@ void CombinerHelper::applyCombineInsertVecElts( void CombinerHelper::applySimplifyAddToSub( MachineInstr &MI, std::tuple<Register, Register> &MatchInfo) { - Builder.setInstr(MI); Register SubLHS, SubRHS; std::tie(SubLHS, SubRHS) = MatchInfo; Builder.buildSub(MI.getOperand(0).getReg(), SubLHS, SubRHS); @@ -3084,7 +3051,6 @@ void CombinerHelper::applyBuildInstructionSteps( MachineInstr &MI, InstructionStepsMatchInfo &MatchInfo) { assert(MatchInfo.InstrsToBuild.size() && "Expected at least one instr to build?"); - Builder.setInstr(MI); for (auto &InstrToBuild : MatchInfo.InstrsToBuild) { assert(InstrToBuild.Opcode && "Expected a valid opcode?"); assert(InstrToBuild.OperandFns.size() && "Expected at least one operand?"); @@ -3120,7 +3086,6 @@ void CombinerHelper::applyAshShlToSextInreg( int64_t ShiftAmt; std::tie(Src, ShiftAmt) = MatchInfo; unsigned Size = MRI.getType(Src).getScalarSizeInBits(); - Builder.setInstrAndDebugLoc(MI); Builder.buildSExtInReg(MI.getOperand(0).getReg(), Src, Size - ShiftAmt); MI.eraseFromParent(); } @@ -3399,7 +3364,6 @@ bool CombinerHelper::matchXorOfAndWithSameReg( void CombinerHelper::applyXorOfAndWithSameReg( MachineInstr &MI, std::pair<Register, Register> &MatchInfo) { // Fold (xor (and x, y), y) -> (and (not x), y) - Builder.setInstrAndDebugLoc(MI); Register X, Y; std::tie(X, Y) = MatchInfo; auto Not = Builder.buildNot(MRI.getType(X), X); @@ -3431,7 +3395,6 @@ bool CombinerHelper::matchPtrAddZero(MachineInstr &MI) { void CombinerHelper::applyPtrAddZero(MachineInstr &MI) { auto &PtrAdd = cast<GPtrAdd>(MI); - Builder.setInstrAndDebugLoc(PtrAdd); Builder.buildIntToPtr(PtrAdd.getReg(0), PtrAdd.getOffsetReg()); PtrAdd.eraseFromParent(); } @@ -3442,7 +3405,6 @@ void CombinerHelper::applySimplifyURemByPow2(MachineInstr &MI) { Register Src0 = MI.getOperand(1).getReg(); Register Pow2Src1 = MI.getOperand(2).getReg(); LLT Ty = MRI.getType(DstReg); - Builder.setInstrAndDebugLoc(MI); // Fold (urem x, pow2) -> (and x, pow2-1) auto NegOne = Builder.buildConstant(Ty, -1); @@ -3507,8 +3469,6 @@ bool CombinerHelper::matchFoldBinOpIntoSelect(MachineInstr &MI, /// to fold.
void CombinerHelper::applyFoldBinOpIntoSelect(MachineInstr &MI, const unsigned &SelectOperand) { - Builder.setInstrAndDebugLoc(MI); - Register Dst = MI.getOperand(0).getReg(); Register LHS = MI.getOperand(1).getReg(); Register RHS = MI.getOperand(2).getReg(); @@ -4029,7 +3989,6 @@ void CombinerHelper::applyExtractVecEltBuildVec(MachineInstr &MI, Register DstReg = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(DstReg); - Builder.setInstrAndDebugLoc(MI); if (ScalarTy != DstTy) { assert(ScalarTy.getSizeInBits() > DstTy.getSizeInBits()); Builder.buildTrunc(DstReg, Reg); @@ -4095,14 +4054,12 @@ void CombinerHelper::applyExtractAllEltsFromBuildVector( void CombinerHelper::applyBuildFn( MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { - Builder.setInstrAndDebugLoc(MI); - MatchInfo(Builder); + applyBuildFnNoErase(MI, MatchInfo); MI.eraseFromParent(); } void CombinerHelper::applyBuildFnNoErase( MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { - Builder.setInstrAndDebugLoc(MI); MatchInfo(Builder); } @@ -4204,7 +4161,6 @@ void CombinerHelper::applyRotateOutOfRange(MachineInstr &MI) { MI.getOpcode() == TargetOpcode::G_ROTR); unsigned Bitsize = MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits(); - Builder.setInstrAndDebugLoc(MI); Register Amt = MI.getOperand(2).getReg(); LLT AmtTy = MRI.getType(Amt); auto Bits = Builder.buildConstant(AmtTy, Bitsize); @@ -5027,7 +4983,6 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) { LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty); LLT ScalarShiftAmtTy = ShiftAmtTy.getScalarType(); auto &MIB = Builder; - MIB.setInstrAndDebugLoc(MI); bool UseNPQ = false; SmallVector<MachineInstrBuilder, 16> PreShifts, PostShifts, MagicFactors, NPQFactors; @@ -5213,7 +5168,6 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) { LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty); LLT ScalarShiftAmtTy = ShiftAmtTy.getScalarType(); auto &MIB = Builder; - MIB.setInstrAndDebugLoc(MI); bool UseSRA = false; SmallVector<MachineInstrBuilder, 16> Shifts, Factors; @@ -5270,6 +5224,93 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) { return MIB.buildMul(Ty, Res, Factor); } +bool CombinerHelper::matchDivByPow2(MachineInstr &MI, bool IsSigned) { + assert((MI.getOpcode() == TargetOpcode::G_SDIV || + MI.getOpcode() == TargetOpcode::G_UDIV) && + "Expected SDIV or UDIV"); + auto &Div = cast<GenericMachineInstr>(MI); + Register RHS = Div.getReg(2); + auto MatchPow2 = [&](const Constant *C) { + auto *CI = dyn_cast<ConstantInt>(C); + return CI && (CI->getValue().isPowerOf2() || + (IsSigned && CI->getValue().isNegatedPowerOf2())); + }; + return matchUnaryPredicate(MRI, RHS, MatchPow2, /*AllowUndefs=*/false); +} + +void CombinerHelper::applySDivByPow2(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV"); + auto &SDiv = cast<GenericMachineInstr>(MI); + Register Dst = SDiv.getReg(0); + Register LHS = SDiv.getReg(1); + Register RHS = SDiv.getReg(2); + LLT Ty = MRI.getType(Dst); + LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty); + LLT CCVT = + Ty.isVector() ?
LLT::vector(Ty.getElementCount(), 1) : LLT::scalar(1); + + // Effectively we want to lower G_SDIV %lhs, %rhs, where %rhs is a power of 2, + // to the following version: + // + // %c1 = G_CTTZ %rhs + // %inexact = G_SUB $bitwidth, %c1 + // %sign = G_ASHR %lhs, $(bitwidth - 1) + // %lshr = G_LSHR %sign, %inexact + // %add = G_ADD %lhs, %lshr + // %ashr = G_ASHR %add, %c1 + // %ashr = G_SELECT %isoneorallones, %lhs, %ashr + // %zero = G_CONSTANT $0 + // %neg = G_NEG %ashr + // %isneg = G_ICMP SLT %rhs, %zero + // %res = G_SELECT %isneg, %neg, %ashr + + unsigned BitWidth = Ty.getScalarSizeInBits(); + auto Zero = Builder.buildConstant(Ty, 0); + + auto Bits = Builder.buildConstant(ShiftAmtTy, BitWidth); + auto C1 = Builder.buildCTTZ(ShiftAmtTy, RHS); + auto Inexact = Builder.buildSub(ShiftAmtTy, Bits, C1); + // Splat the sign bit into the register + auto Sign = Builder.buildAShr( + Ty, LHS, Builder.buildConstant(ShiftAmtTy, BitWidth - 1)); + + // Add (LHS < 0) ? abs2 - 1 : 0; + auto LSrl = Builder.buildLShr(Ty, Sign, Inexact); + auto Add = Builder.buildAdd(Ty, LHS, LSrl); + auto AShr = Builder.buildAShr(Ty, Add, C1); + + // Special case: (sdiv X, 1) -> X + // Special Case: (sdiv X, -1) -> 0-X + auto One = Builder.buildConstant(Ty, 1); + auto MinusOne = Builder.buildConstant(Ty, -1); + auto IsOne = Builder.buildICmp(CmpInst::Predicate::ICMP_EQ, CCVT, RHS, One); + auto IsMinusOne = + Builder.buildICmp(CmpInst::Predicate::ICMP_EQ, CCVT, RHS, MinusOne); + auto IsOneOrMinusOne = Builder.buildOr(CCVT, IsOne, IsMinusOne); + AShr = Builder.buildSelect(Ty, IsOneOrMinusOne, LHS, AShr); + + // If divided by a positive value, we're done. Otherwise, the result must be + // negated. + auto Neg = Builder.buildNeg(Ty, AShr); + auto IsNeg = Builder.buildICmp(CmpInst::Predicate::ICMP_SLT, CCVT, RHS, Zero); + Builder.buildSelect(MI.getOperand(0).getReg(), IsNeg, Neg, AShr); + MI.eraseFromParent(); +} + +void CombinerHelper::applyUDivByPow2(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_UDIV && "Expected UDIV"); + auto &UDiv = cast<GenericMachineInstr>(MI); + Register Dst = UDiv.getReg(0); + Register LHS = UDiv.getReg(1); + Register RHS = UDiv.getReg(2); + LLT Ty = MRI.getType(Dst); + LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty); + + auto C1 = Builder.buildCTTZ(ShiftAmtTy, RHS); + Builder.buildLShr(MI.getOperand(0).getReg(), LHS, C1); + MI.eraseFromParent(); +} + bool CombinerHelper::matchUMulHToLShr(MachineInstr &MI) { assert(MI.getOpcode() == TargetOpcode::G_UMULH); Register RHS = MI.getOperand(2).getReg(); @@ -5294,7 +5335,6 @@ void CombinerHelper::applyUMulHToLShr(MachineInstr &MI) { LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty); unsigned NumEltBits = Ty.getScalarSizeInBits(); - Builder.setInstrAndDebugLoc(MI); auto LogBase2 = buildLogBase2(RHS, Builder); auto ShiftAmt = Builder.buildSub(Ty, Builder.buildConstant(Ty, NumEltBits), LogBase2); @@ -5374,7 +5414,6 @@ bool CombinerHelper::matchFsubToFneg(MachineInstr &MI, Register &MatchInfo) { } void CombinerHelper::applyFsubToFneg(MachineInstr &MI, Register &MatchInfo) { - Builder.setInstrAndDebugLoc(MI); Register Dst = MI.getOperand(0).getReg(); Builder.buildFNeg( Dst, Builder.buildFCanonicalize(MRI.getType(Dst), MatchInfo).getReg(0)); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 4981d7b80b0b2..797bbf7efe605 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -3768,9 +3768,11 @@
LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { } case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: { auto [OldValRes, SuccessRes, Addr, CmpVal, NewVal] = MI.getFirst5Regs(); - MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal, + Register NewOldValRes = MRI.cloneVirtualRegister(OldValRes); + MIRBuilder.buildAtomicCmpXchg(NewOldValRes, Addr, CmpVal, NewVal, **MI.memoperands_begin()); - MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal); + MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, NewOldValRes, CmpVal); + MIRBuilder.buildCopy(OldValRes, NewOldValRes); MI.eraseFromParent(); return Legalized; } @@ -3789,8 +3791,12 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { case G_UADDO: { auto [Res, CarryOut, LHS, RHS] = MI.getFirst4Regs(); - MIRBuilder.buildAdd(Res, LHS, RHS); - MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS); + Register NewRes = MRI.cloneVirtualRegister(Res); + + MIRBuilder.buildAdd(NewRes, LHS, RHS); + MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, NewRes, RHS); + + MIRBuilder.buildCopy(Res, NewRes); MI.eraseFromParent(); return Legalized; @@ -3800,6 +3806,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { const LLT CondTy = MRI.getType(CarryOut); const LLT Ty = MRI.getType(Res); + Register NewRes = MRI.cloneVirtualRegister(Res); + // Initial add of the two operands. auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS); @@ -3808,15 +3816,18 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { // Add the sum and the carry. auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn); - MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn); + MIRBuilder.buildAdd(NewRes, TmpRes, ZExtCarryIn); // Second check for carry. We can only carry if the initial sum is all 1s // and the carry is set, resulting in a new sum of 0. auto Zero = MIRBuilder.buildConstant(Ty, 0); - auto ResEqZero = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, Res, Zero); + auto ResEqZero = + MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, NewRes, Zero); auto Carry2 = MIRBuilder.buildAnd(CondTy, ResEqZero, CarryIn); MIRBuilder.buildOr(CarryOut, Carry, Carry2); + MIRBuilder.buildCopy(Res, NewRes); + MI.eraseFromParent(); return Legalized; } @@ -6389,12 +6400,26 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI) { // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks. auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01))); - auto ResTmp = B.buildMul(Ty, B8Count, MulMask); // Shift count result from 8 high bits to low bits. 
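The G_UADDO lowering above is just the classic wrap check; the NewRes/buildCopy dance is a structural detail, presumably to avoid the result register briefly having two definitions while the original instruction is still in place. The arithmetic itself, in plain C++:

// What the lowered G_UADDO computes: with wrapping unsigned arithmetic, the
// sum overflows iff the wrapped sum is smaller than one of the addends.
#include <cassert>
#include <cstdint>

static bool uaddo(uint32_t LHS, uint32_t RHS, uint32_t &Res) {
  Res = LHS + RHS;  // G_ADD into the cloned result register
  return Res < RHS; // G_ICMP ult %sum, %rhs yields the carry-out
}

int main() {
  uint32_t S;
  assert(!uaddo(1u, 2u, S) && S == 3u);
  assert(uaddo(0xFFFFFFFFu, 1u, S) && S == 0u); // wraps to zero, carry set
  assert(uaddo(0x80000000u, 0x80000000u, S) && S == 0u);
}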
auto C_SizeM8 = B.buildConstant(Ty, Size - 8); - B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8); + auto IsMulSupported = [this](const LLT Ty) { + auto Action = LI.getAction({TargetOpcode::G_MUL, {Ty}}).Action; + return Action == Legal || Action == WidenScalar || Action == Custom; + }; + if (IsMulSupported(Ty)) { + auto ResTmp = B.buildMul(Ty, B8Count, MulMask); + B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8); + } else { + auto ResTmp = B8Count; + for (unsigned Shift = 8; Shift < Size; Shift *= 2) { + auto ShiftC = B.buildConstant(Ty, Shift); + auto Shl = B.buildShl(Ty, ResTmp, ShiftC); + ResTmp = B.buildAdd(Ty, ResTmp, Shl); + } + B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8); + } MI.eraseFromParent(); return Legalized; } @@ -7657,10 +7682,12 @@ LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) { LLT Ty = Dst0Ty; LLT BoolTy = Dst1Ty; + Register NewDst0 = MRI.cloneVirtualRegister(Dst0); + if (IsAdd) - MIRBuilder.buildAdd(Dst0, LHS, RHS); + MIRBuilder.buildAdd(NewDst0, LHS, RHS); else - MIRBuilder.buildSub(Dst0, LHS, RHS); + MIRBuilder.buildSub(NewDst0, LHS, RHS); // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow. @@ -7673,12 +7700,15 @@ LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) { // (LHS) if and only if the other operand (RHS) is (non-zero) positive, // otherwise there will be overflow. auto ResultLowerThanLHS = - MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS); + MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, NewDst0, LHS); auto ConditionRHS = MIRBuilder.buildICmp( IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero); MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS); + + MIRBuilder.buildCopy(Dst0, NewDst0); MI.eraseFromParent(); + return Legalized; } diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 8c41f8b1bdcdb..c3bc3203b6360 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -997,6 +997,74 @@ llvm::ConstantFoldCountZeros(Register Src, const MachineRegisterInfo &MRI, return std::nullopt; } +std::optional> +llvm::ConstantFoldICmp(unsigned Pred, const Register Op1, const Register Op2, + const MachineRegisterInfo &MRI) { + LLT Ty = MRI.getType(Op1); + if (Ty != MRI.getType(Op2)) + return std::nullopt; + + auto TryFoldScalar = [&MRI, Pred](Register LHS, + Register RHS) -> std::optional { + auto LHSCst = getIConstantVRegVal(LHS, MRI); + auto RHSCst = getIConstantVRegVal(RHS, MRI); + if (!LHSCst || !RHSCst) + return std::nullopt; + + switch (Pred) { + case CmpInst::Predicate::ICMP_EQ: + return APInt(/*numBits=*/1, LHSCst->eq(*RHSCst)); + case CmpInst::Predicate::ICMP_NE: + return APInt(/*numBits=*/1, LHSCst->ne(*RHSCst)); + case CmpInst::Predicate::ICMP_UGT: + return APInt(/*numBits=*/1, LHSCst->ugt(*RHSCst)); + case CmpInst::Predicate::ICMP_UGE: + return APInt(/*numBits=*/1, LHSCst->uge(*RHSCst)); + case CmpInst::Predicate::ICMP_ULT: + return APInt(/*numBits=*/1, LHSCst->ult(*RHSCst)); + case CmpInst::Predicate::ICMP_ULE: + return APInt(/*numBits=*/1, LHSCst->ule(*RHSCst)); + case CmpInst::Predicate::ICMP_SGT: + return APInt(/*numBits=*/1, LHSCst->sgt(*RHSCst)); + case CmpInst::Predicate::ICMP_SGE: + return APInt(/*numBits=*/1, LHSCst->sge(*RHSCst)); + case CmpInst::Predicate::ICMP_SLT: + return APInt(/*numBits=*/1, LHSCst->slt(*RHSCst)); + case CmpInst::Predicate::ICMP_SLE: + return APInt(/*numBits=*/1, LHSCst->sle(*RHSCst)); + default: + return std::nullopt; + } + }; + + SmallVector FoldedICmps; + 
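A reduced model of what the new ConstantFoldICmp helper does may help here: fold a predicate lane by lane over two constant vectors, producing one boolean per lane, and bail out the moment any lane is not a known constant. This sketch uses plain int64_t where the real code uses APInt and G_BUILD_VECTOR sources.

#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

enum class Pred { EQ, ULT, SGT };

static std::optional<std::vector<bool>>
foldICmp(Pred P, const std::vector<std::optional<int64_t>> &LHS,
         const std::vector<std::optional<int64_t>> &RHS) {
  std::vector<bool> Out;
  for (size_t I = 0; I < LHS.size(); ++I) {
    if (!LHS[I] || !RHS[I])
      return std::nullopt; // one unknown lane spoils the whole fold
    const int64_t A = *LHS[I], B = *RHS[I];
    switch (P) {
    case Pred::EQ:  Out.push_back(A == B); break;
    case Pred::ULT: Out.push_back((uint64_t)A < (uint64_t)B); break;
    case Pred::SGT: Out.push_back(A > B); break;
    }
  }
  return Out;
}

int main() {
  auto R = foldICmp(Pred::ULT, {{1}, {-1}}, {{2}, {2}});
  assert(R && (*R)[0] && !(*R)[1]); // -1 is huge when compared unsigned
  assert(!foldICmp(Pred::EQ, {{1}, std::nullopt}, {{1}, {2}}));
}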
+ if (Ty.isVector()) { + // Try to constant fold each element. + auto *BV1 = getOpcodeDef(Op1, MRI); + auto *BV2 = getOpcodeDef(Op2, MRI); + if (!BV1 || !BV2) + return std::nullopt; + assert(BV1->getNumSources() == BV2->getNumSources() && "Invalid vectors"); + for (unsigned I = 0; I < BV1->getNumSources(); ++I) { + if (auto MaybeFold = + TryFoldScalar(BV1->getSourceReg(I), BV2->getSourceReg(I))) { + FoldedICmps.emplace_back(*MaybeFold); + continue; + } + return std::nullopt; + } + return FoldedICmps; + } + + if (auto MaybeCst = TryFoldScalar(Op1, Op2)) { + FoldedICmps.emplace_back(*MaybeCst); + return FoldedICmps; + } + + return std::nullopt; +} + bool llvm::isKnownToBeAPowerOfTwo(Register Reg, const MachineRegisterInfo &MRI, GISelKnownBits *KB) { std::optional DefSrcReg = diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index c581738add3d6..d4b9f8e2f8e6d 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -39,6 +39,7 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" @@ -553,6 +554,11 @@ uint32_t MachineInstr::copyFlagsFromInstruction(const Instruction &I) { MIFlags |= MachineInstr::MIFlag::NoSWrap; if (OB->hasNoUnsignedWrap()) MIFlags |= MachineInstr::MIFlag::NoUWrap; + } else if (const TruncInst *TI = dyn_cast(&I)) { + if (TI->hasNoSignedWrap()) + MIFlags |= MachineInstr::MIFlag::NoSWrap; + if (TI->hasNoUnsignedWrap()) + MIFlags |= MachineInstr::MIFlag::NoUWrap; } // Copy the nonneg flag. diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 8be03b66e155f..962f0d98e3be9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8710,11 +8710,21 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const { } // v = (v * 0x01010101...) >> (Len - 8) - SDValue Mask01 = - DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT); - return DAG.getNode(ISD::SRL, dl, VT, - DAG.getNode(ISD::MUL, dl, VT, Op, Mask01), - DAG.getConstant(Len - 8, dl, ShVT)); + SDValue V; + if (isOperationLegalOrCustomOrPromote( + ISD::MUL, getTypeToTransformTo(*DAG.getContext(), VT))) { + SDValue Mask01 = + DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT); + V = DAG.getNode(ISD::MUL, dl, VT, Op, Mask01); + } else { + V = Op; + for (unsigned Shift = 8; Shift < Len; Shift *= 2) { + SDValue ShiftC = DAG.getShiftAmountConstant(Shift, VT, dl); + V = DAG.getNode(ISD::ADD, dl, VT, V, + DAG.getNode(ISD::SHL, dl, VT, V, ShiftC)); + } + } + return DAG.getNode(ISD::SRL, dl, VT, V, DAG.getConstant(Len - 8, dl, ShVT)); } SDValue TargetLowering::expandVPCTPOP(SDNode *Node, SelectionDAG &DAG) const { @@ -8767,10 +8777,22 @@ SDValue TargetLowering::expandVPCTPOP(SDNode *Node, SelectionDAG &DAG) const { return Op; // v = (v * 0x01010101...) 
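Both the lowerBitCount and expandCTPOP changes above swap the multiply by the 0x01010101... splat for a chain of shift-and-add doublings when MUL is not cheap. A plain C++ rendering of the 32-bit case shows why the two are equivalent:

// Once each byte of V holds its own popcount, doubling shifts accumulate the
// per-byte counts so the grand total lands in the top byte, exactly what the
// multiply by 0x01010101 would have produced.
#include <cassert>
#include <cstdint>

static uint32_t popcount32(uint32_t V) {
  V = V - ((V >> 1) & 0x55555555u);                 // 2-bit field counts
  V = (V & 0x33333333u) + ((V >> 2) & 0x33333333u); // 4-bit field counts
  V = (V + (V >> 4)) & 0x0F0F0F0Fu;                 // per-byte counts (B8Count)
  for (unsigned Shift = 8; Shift < 32; Shift *= 2)  // instead of V * 0x01010101
    V = V + (V << Shift);
  return V >> (32 - 8);                             // sum sits in the top byte
}

int main() {
  assert(popcount32(0) == 0);
  assert(popcount32(0xFFFFFFFFu) == 32);
  assert(popcount32(0x80000001u) == 2);
}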
>> (Len - 8) - SDValue Mask01 = - DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT); - return DAG.getNode(ISD::VP_LSHR, dl, VT, - DAG.getNode(ISD::VP_MUL, dl, VT, Op, Mask01, Mask, VL), + SDValue V; + if (isOperationLegalOrCustomOrPromote( + ISD::VP_MUL, getTypeToTransformTo(*DAG.getContext(), VT))) { + SDValue Mask01 = + DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT); + V = DAG.getNode(ISD::VP_MUL, dl, VT, Op, Mask01, Mask, VL); + } else { + V = Op; + for (unsigned Shift = 8; Shift < Len; Shift *= 2) { + SDValue ShiftC = DAG.getShiftAmountConstant(Shift, VT, dl); + V = DAG.getNode(ISD::VP_ADD, dl, VT, V, + DAG.getNode(ISD::VP_SHL, dl, VT, V, ShiftC, Mask, VL), + Mask, VL); + } + } + return DAG.getNode(ISD::VP_LSHR, dl, VT, V, DAG.getConstant(Len - 8, dl, ShVT), Mask, VL); } diff --git a/llvm/lib/CodeGen/TypePromotion.cpp b/llvm/lib/CodeGen/TypePromotion.cpp index b0830308908d6..89aea3a291611 100644 --- a/llvm/lib/CodeGen/TypePromotion.cpp +++ b/llvm/lib/CodeGen/TypePromotion.cpp @@ -643,7 +643,7 @@ void IRPromoter::ConvertTruncs() { ConstantInt *Mask = ConstantInt::get(SrcTy, APInt::getMaxValue(NumBits).getZExtValue()); Value *Masked = Builder.CreateAnd(Trunc->getOperand(0), Mask); - if (SrcTy != ExtTy) + if (SrcTy->getBitWidth() > ExtTy->getBitWidth()) Masked = Builder.CreateTrunc(Masked, ExtTy); if (auto *I = dyn_cast(Masked)) diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 2e633205f8e26..b443fd2eb8183 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -1424,6 +1424,11 @@ static void WriteOptimizationInfo(raw_ostream &Out, const User *U) { } else if (const auto *NNI = dyn_cast(U)) { if (NNI->hasNonNeg()) Out << " nneg"; + } else if (const auto *TI = dyn_cast(U)) { + if (TI->hasNoUnsignedWrap()) + Out << " nuw"; + if (TI->hasNoSignedWrap()) + Out << " nsw"; } } diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index 47a7f2c9de790..0602a55b9fe7f 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -370,11 +370,17 @@ bool Instruction::isOnlyUserOfAnyOperand() { } void Instruction::setHasNoUnsignedWrap(bool b) { - cast(this)->setHasNoUnsignedWrap(b); + if (auto *Inst = dyn_cast(this)) + Inst->setHasNoUnsignedWrap(b); + else + cast(this)->setHasNoUnsignedWrap(b); } void Instruction::setHasNoSignedWrap(bool b) { - cast(this)->setHasNoSignedWrap(b); + if (auto *Inst = dyn_cast(this)) + Inst->setHasNoSignedWrap(b); + else + cast(this)->setHasNoSignedWrap(b); } void Instruction::setIsExact(bool b) { @@ -388,11 +394,17 @@ void Instruction::setNonNeg(bool b) { } bool Instruction::hasNoUnsignedWrap() const { - return cast(this)->hasNoUnsignedWrap(); + if (auto *Inst = dyn_cast(this)) + return Inst->hasNoUnsignedWrap(); + + return cast(this)->hasNoUnsignedWrap(); } bool Instruction::hasNoSignedWrap() const { - return cast(this)->hasNoSignedWrap(); + if (auto *Inst = dyn_cast(this)) + return Inst->hasNoSignedWrap(); + + return cast(this)->hasNoSignedWrap(); } bool Instruction::hasNonNeg() const { @@ -432,6 +444,11 @@ void Instruction::dropPoisonGeneratingFlags() { case Instruction::ZExt: setNonNeg(false); break; + + case Instruction::Trunc: + cast(this)->setHasNoUnsignedWrap(false); + cast(this)->setHasNoSignedWrap(false); + break; } if (isa(this)) { @@ -626,6 +643,13 @@ void Instruction::andIRFlags(const Value *V) { } } + if (auto *TI = dyn_cast(V)) { + if (isa(this)) { + setHasNoSignedWrap(hasNoSignedWrap() && TI->hasNoSignedWrap()); + setHasNoUnsignedWrap(hasNoUnsignedWrap() && 
TI->hasNoUnsignedWrap()); + } + } + if (auto *PE = dyn_cast(V)) if (isa(this)) setIsExact(isExact() && PE->isExact()); diff --git a/llvm/lib/IR/Operator.cpp b/llvm/lib/IR/Operator.cpp index b9cd219d94dc8..7b4449cd825f9 100644 --- a/llvm/lib/IR/Operator.cpp +++ b/llvm/lib/IR/Operator.cpp @@ -27,6 +27,11 @@ bool Operator::hasPoisonGeneratingFlags() const { auto *OBO = cast(this); return OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap(); } + case Instruction::Trunc: { + if (auto *TI = dyn_cast(this)) + return TI->hasNoUnsignedWrap() || TI->hasNoSignedWrap(); + return false; + } case Instruction::UDiv: case Instruction::SDiv: case Instruction::AShr: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index bee43b6c18c88..f283af6fa07d3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -851,12 +851,7 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const { return true; case ISD::INTRINSIC_WO_CHAIN: { unsigned IntrID = N->getConstantOperandVal(0); - switch (IntrID) { - case Intrinsic::amdgcn_readfirstlane: - case Intrinsic::amdgcn_readlane: - return true; - } - return false; + return AMDGPU::isIntrinsicAlwaysUniform(IntrID); } case ISD::LOAD: if (cast(N)->getMemOperand()->getAddrSpace() == diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index c5d7ee76275f8..cd388ed3e3191 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -285,13 +285,16 @@ def RSqrt : DXILOpMapping<25, unary, int_dx_rsqrt, "Returns the reciprocal of the square root of the specified value." "rsqrt(x) = 1 / sqrt(x).", [llvm_halforfloat_ty, LLVMMatchType<0>]>; -def Round : DXILOpMapping<26, unary, int_round, +def Round : DXILOpMapping<26, unary, int_roundeven, "Returns the input rounded to the nearest integer" "within a floating-point type.", [llvm_halforfloat_ty, LLVMMatchType<0>]>; def Floor : DXILOpMapping<27, unary, int_floor, "Returns the largest integer that is less than or equal to the input.", [llvm_halforfloat_ty, LLVMMatchType<0>]>; +def Ceil : DXILOpMapping<28, unary, int_ceil, + "Returns the smallest integer that is greater than or equal to the input.", + [llvm_halforfloat_ty, LLVMMatchType<0>]>; def Trunc : DXILOpMapping<29, unary, int_trunc, "Returns the specified value truncated to the integer component.", [llvm_halforfloat_ty, LLVMMatchType<0>]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index ece9821a2d0d9..9f31b72bbceb1 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -1022,7 +1022,6 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, const DataLayout &DL = getDataLayout(); // GlobalVariables are always constant pointers themselves. 
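The contract the new trunc nuw/nsw flags encode, checked in plain C++: "nuw" means no truncated bit was set (zext(trunc(x)) == x), "nsw" means the value was already sign-representable in the narrow type (sext(trunc(x)) == x). When a flag is present but its condition fails, the result is poison, which is why dropPoisonGeneratingFlags and hasPoisonGeneratingFlags above must know about TruncInst.

#include <cassert>
#include <cstdint>

static bool truncToI8IsNUW(uint32_t X) { return (uint32_t)(uint8_t)X == X; }
static bool truncToI8IsNSW(int32_t X) { return (int32_t)(int8_t)X == X; }

int main() {
  assert(truncToI8IsNUW(200) && !truncToI8IsNUW(256));  // 256 loses a set bit
  assert(truncToI8IsNSW(-100) && !truncToI8IsNSW(200)); // 200 flips i8's sign bit
}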
- PointerType *PTy = GVar->getType(); Type *ETy = GVar->getValueType(); if (GVar->hasExternalLinkage()) { @@ -1030,6 +1029,9 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, O << ".visible "; else O << ".extern "; + } else if (STI.getPTXVersion() >= 50 && GVar->hasCommonLinkage() && + GVar->getAddressSpace() == ADDRESS_SPACE_GLOBAL) { + O << ".common "; } else if (GVar->hasLinkOnceLinkage() || GVar->hasWeakLinkage() || GVar->hasAvailableExternallyLinkage() || GVar->hasCommonLinkage()) { @@ -1141,7 +1143,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, } O << "."; - emitPTXAddressSpace(PTy->getAddressSpace(), O); + emitPTXAddressSpace(GVar->getAddressSpace(), O); if (isManaged(*GVar)) { if (STI.getPTXVersion() < 40 || STI.getSmVersion() < 30) { @@ -1170,8 +1172,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, // Ptx allows variable initilization only for constant and global state // spaces. if (GVar->hasInitializer()) { - if ((PTy->getAddressSpace() == ADDRESS_SPACE_GLOBAL) || - (PTy->getAddressSpace() == ADDRESS_SPACE_CONST)) { + if ((GVar->getAddressSpace() == ADDRESS_SPACE_GLOBAL) || + (GVar->getAddressSpace() == ADDRESS_SPACE_CONST)) { const Constant *Initializer = GVar->getInitializer(); // 'undef' is treated as there is no value specified. if (!Initializer->isNullValue() && !isa(Initializer)) { @@ -1186,7 +1188,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, !isa(GVar->getInitializer())) { report_fatal_error("initial value of '" + GVar->getName() + "' is not allowed in addrspace(" + - Twine(PTy->getAddressSpace()) + ")"); + Twine(GVar->getAddressSpace()) + ")"); } } } @@ -1205,8 +1207,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, ElementSize = DL.getTypeStoreSize(ETy); // Ptx allows variable initilization only for constant and // global state spaces. - if (((PTy->getAddressSpace() == ADDRESS_SPACE_GLOBAL) || - (PTy->getAddressSpace() == ADDRESS_SPACE_CONST)) && + if (((GVar->getAddressSpace() == ADDRESS_SPACE_GLOBAL) || + (GVar->getAddressSpace() == ADDRESS_SPACE_CONST)) && GVar->hasInitializer()) { const Constant *Initializer = GVar->getInitializer(); if (!isa(Initializer) && !Initializer->isNullValue()) { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 3cd9ecb9dd681..e48ca4a905ce9 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -13530,7 +13530,7 @@ struct CombineResult; enum ExtKind : uint8_t { ZExt = 1 << 0, SExt = 1 << 1, FPExt = 1 << 2 }; /// Helper class for folding sign/zero extensions. 
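On the DXIL.td change above: llvm.round and llvm.roundeven differ only on ties, and remapping op 26 to int_roundeven implies the DXIL Round op wants ties-to-even. The C library equivalents make the difference concrete (nearbyint in the default FE_TONEAREST mode behaves like roundeven):

#include <cassert>
#include <cfenv>
#include <cmath>

int main() {
  std::fesetround(FE_TONEAREST);
  assert(std::round(2.5) == 3.0);       // llvm.round: ties away from zero
  assert(std::nearbyint(2.5) == 2.0);   // llvm.roundeven: ties to even
  assert(std::round(-3.5) == -4.0);
  assert(std::nearbyint(-3.5) == -4.0); // -4 is already the even neighbor
}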
/// In particular, this class is used for the following combines: -/// add | add_vl -> vwadd(u) | vwadd(u)_w +/// add | add_vl | or disjoint -> vwadd(u) | vwadd(u)_w /// sub | sub_vl -> vwsub(u) | vwsub(u)_w /// mul | mul_vl -> vwmul(u) | vwmul_su /// fadd -> vfwadd | vfwadd_w @@ -13678,6 +13678,7 @@ struct NodeExtensionHelper { case RISCVISD::ADD_VL: case RISCVISD::VWADD_W_VL: case RISCVISD::VWADDU_W_VL: + case ISD::OR: return RISCVISD::VWADD_VL; case ISD::SUB: case RISCVISD::SUB_VL: @@ -13700,6 +13701,7 @@ struct NodeExtensionHelper { case RISCVISD::ADD_VL: case RISCVISD::VWADD_W_VL: case RISCVISD::VWADDU_W_VL: + case ISD::OR: return RISCVISD::VWADDU_VL; case ISD::SUB: case RISCVISD::SUB_VL: @@ -13745,6 +13747,7 @@ struct NodeExtensionHelper { switch (Opcode) { case ISD::ADD: case RISCVISD::ADD_VL: + case ISD::OR: return SupportsExt == ExtKind::SExt ? RISCVISD::VWADD_W_VL : RISCVISD::VWADDU_W_VL; case ISD::SUB: @@ -13865,6 +13868,10 @@ struct NodeExtensionHelper { case ISD::MUL: { return Root->getValueType(0).isScalableVector(); } + case ISD::OR: { + return Root->getValueType(0).isScalableVector() && + Root->getFlags().hasDisjoint(); + } // Vector Widening Integer Add/Sub/Mul Instructions case RISCVISD::ADD_VL: case RISCVISD::MUL_VL: @@ -13945,7 +13952,8 @@ struct NodeExtensionHelper { switch (Root->getOpcode()) { case ISD::ADD: case ISD::SUB: - case ISD::MUL: { + case ISD::MUL: + case ISD::OR: { SDLoc DL(Root); MVT VT = Root->getSimpleValueType(0); return getDefaultScalableVLOps(VT, DL, DAG, Subtarget); @@ -13968,6 +13976,7 @@ struct NodeExtensionHelper { switch (N->getOpcode()) { case ISD::ADD: case ISD::MUL: + case ISD::OR: case RISCVISD::ADD_VL: case RISCVISD::MUL_VL: case RISCVISD::VWADD_W_VL: @@ -14034,6 +14043,7 @@ struct CombineResult { case ISD::ADD: case ISD::SUB: case ISD::MUL: + case ISD::OR: Merge = DAG.getUNDEF(Root->getValueType(0)); break; } @@ -14184,6 +14194,7 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) { switch (Root->getOpcode()) { case ISD::ADD: case ISD::SUB: + case ISD::OR: case RISCVISD::ADD_VL: case RISCVISD::SUB_VL: case RISCVISD::FADD_VL: @@ -14227,9 +14238,9 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) { /// Combine a binary operation to its equivalent VW or VW_W form. 
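The identity that lets `or disjoint` join these widening-add combines, as checked by the new `Root->getFlags().hasDisjoint()` guard below: when two operands share no set bits there are no carries, so OR and ADD produce the same value and the vwadd(u) patterns apply unchanged. A plain C++ check:

#include <cassert>
#include <cstdint>

static uint16_t disjointOrAsWideAdd(uint8_t A, uint8_t B) {
  assert((A & B) == 0 && "the disjoint flag would be a lie otherwise");
  const uint16_t Wide = (uint16_t)A + (uint16_t)B; // what vwaddu.vv computes
  assert(Wide == (uint16_t)(A | B));               // identical to the OR
  return Wide;
}

int main() {
  disjointOrAsWideAdd(0xF0, 0x0F);
  disjointOrAsWideAdd(0x80, 0x01);
}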
/// The supported combines are: -/// add_vl -> vwadd(u) | vwadd(u)_w -/// sub_vl -> vwsub(u) | vwsub(u)_w -/// mul_vl -> vwmul(u) | vwmul_su +/// add | add_vl | or disjoint -> vwadd(u) | vwadd(u)_w +/// sub | sub_vl -> vwsub(u) | vwsub(u)_w +/// mul | mul_vl -> vwmul(u) | vwmul_su /// fadd_vl -> vfwadd | vfwadd_w /// fsub_vl -> vfwsub | vfwsub_w /// fmul_vl -> vfwmul @@ -15889,8 +15900,11 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, } case ISD::AND: return performANDCombine(N, DCI, Subtarget); - case ISD::OR: + case ISD::OR: { + if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget)) + return V; return performORCombine(N, DCI, Subtarget); + } case ISD::XOR: return performXORCombine(N, DAG, Subtarget); case ISD::MUL: diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index e42ac68a8b67f..8cdaa7f2e5ea4 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -2133,19 +2133,6 @@ multiclass VPseudoBinary { - let VLMul = MInfo.value, SEW=sew in { - defvar suffix = !if(sew, "_" # MInfo.MX # "_E" # sew, "_" # MInfo.MX); - def suffix : VPseudoBinaryNoMaskTU; - } -} - multiclass VPseudoBinaryRoundingMode; def "_" # MInfo.MX # "_MASK_TIED" : VPseudoTiedBinaryMask; + Constraint, TargetConstraintType>, + RISCVMaskedPseudo; } } @@ -2225,7 +2213,8 @@ multiclass VPseudoTiedBinaryRoundingMode; def "_" # MInfo.MX # "_MASK_TIED" : - VPseudoTiedBinaryMaskRoundingMode; + VPseudoTiedBinaryMaskRoundingMode, + RISCVMaskedPseudo; } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td index 60db03d68e476..e66b061c760ac 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td @@ -233,6 +233,19 @@ class VPseudoTernaryNoMask_Zvk(PseudoToVInst.VInst); } +multiclass VPseudoBinaryNoMaskTU_Zvk { + let VLMul = MInfo.value, SEW=sew in { + defvar suffix = !if(sew, "_" # MInfo.MX # "_E" # sew, "_" # MInfo.MX); + def suffix : VPseudoBinaryNoMaskTU; + } +} + multiclass VPseudoTernaryNoMask_Zvk("WriteVIALUV_" # mx); defvar ReadVIALUV_MX = !cast("ReadVIALUV_" # mx); - defm _VI : VPseudoBinaryNoMask, + defm _VI : VPseudoBinaryNoMaskTU_Zvk, Sched<[WriteVIALUV_MX, ReadVIALUV_MX, ReadVIALUV_MX, ReadVMask]>; } } @@ -317,7 +330,7 @@ multiclass VPseudoVALU_VV_NoMaskTU_Zvk { defvar WriteVIALUV_MX = !cast("WriteVIALUV_" # mx); defvar ReadVIALUV_MX = !cast("ReadVIALUV_" # mx); - defm _VV : VPseudoBinaryNoMask, + defm _VV : VPseudoBinaryNoMaskTU_Zvk, Sched<[WriteVIALUV_MX, ReadVIALUV_MX, ReadVIALUV_MX, ReadVMask]>; } } diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index fef0a5a90cd6f..c45ec8981ab1f 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -334,6 +334,44 @@ let Predicates = [In64BitMode] in { def IMUL32rmi_EVEX : IMulOpMI_RF, PL; def IMUL64rmi32_EVEX : IMulOpMI_RF, PL; } + +// IMULZU instructions +class IMulZUOpRI8_R + : BinOpRI8<0x6B, "imulzu", binop_ndd_args, t, MRMSrcReg, + (outs t.RegClass:$dst)> { + let SchedRW = [sched]; +} +class IMulZUOpRI_R + : BinOpRI<0x69, "imulzu", binop_ndd_args, t, MRMSrcReg, + (outs t.RegClass:$dst), []> { + let SchedRW = [sched]; +} +class IMulZUOpMI8_R + : BinOpMI8<"imulzu", binop_ndd_args, t, MRMSrcMem, (outs t.RegClass:$dst)> { + let Opcode = 0x6B; + let SchedRW = [sched.Folded]; +} +class IMulZUOpMI_R + : BinOpMI<0x69, "imulzu", binop_ndd_args, t, 
MRMSrcMem, + (outs t.RegClass:$dst), []> { + let SchedRW = [sched.Folded]; +} + +let Defs = [EFLAGS], Predicates = [HasEGPR, In64BitMode] in { + def IMULZU16rri8 : IMulZUOpRI8_R, ZU, PD; + def IMULZU16rmi8 : IMulZUOpMI8_R, ZU, PD; + def IMULZU16rri : IMulZUOpRI_R, ZU, PD; + def IMULZU16rmi : IMulZUOpMI_R, ZU, PD; + def IMULZU32rri8 : IMulZUOpRI8_R, ZU; + def IMULZU32rmi8 : IMulZUOpMI8_R, ZU; + def IMULZU32rri : IMulZUOpRI_R, ZU; + def IMULZU32rmi : IMulZUOpMI_R, ZU; + def IMULZU64rri8 : IMulZUOpRI8_R, ZU; + def IMULZU64rmi8 : IMulZUOpMI8_R, ZU; + def IMULZU64rri32 : IMulZUOpRI_R, ZU; + def IMULZU64rmi32 : IMulZUOpMI_R, ZU; +} + //===----------------------------------------------------------------------===// // INC and DEC Instructions // diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index 04d9d104ebc4b..8387b76a40cdd 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -119,6 +119,8 @@ class NDD { class NF: T_MAP4, EVEX, EVEX_NF; // PL - Helper for promoted legacy instructions class PL: T_MAP4, EVEX, ExplicitEVEXPrefix; +// ZU - Helper for Zero Upper instructions +class ZU: T_MAP4, EVEX, EVEX_B; //===----------------------------------------------------------------------===// // X86 Type infomation definitions diff --git a/llvm/lib/TextAPI/BinaryReader/CMakeLists.txt b/llvm/lib/TextAPI/BinaryReader/CMakeLists.txt index cbdf7b2c96969..c4535310d91c1 100644 --- a/llvm/lib/TextAPI/BinaryReader/CMakeLists.txt +++ b/llvm/lib/TextAPI/BinaryReader/CMakeLists.txt @@ -2,6 +2,7 @@ add_llvm_component_library(LLVMTextAPIBinaryReader DylibReader.cpp LINK_COMPONENTS + DebugInfoDWARF Support Object TextAPI diff --git a/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp b/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp index 2e36d4a8b98ce..f92a2d19a63fc 100644 --- a/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp +++ b/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp @@ -12,7 +12,8 @@ #include "llvm/TextAPI/DylibReader.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringMap.h" +#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/Object/Binary.h" #include "llvm/Object/MachOUniversal.h" #include "llvm/Support/Endian.h" @@ -432,3 +433,111 @@ DylibReader::get(MemoryBufferRef Buffer) { return convertToInterfaceFile(*SlicesOrErr); } + +static void DWARFErrorHandler(Error Err) { /**/ } + +static SymbolToSourceLocMap +accumulateLocs(MachOObjectFile &Obj, + const std::unique_ptr &DiCtx) { + SymbolToSourceLocMap LocMap; + for (const auto &Symbol : Obj.symbols()) { + Expected FlagsOrErr = Symbol.getFlags(); + if (!FlagsOrErr) { + consumeError(FlagsOrErr.takeError()); + continue; + } + + if (!(*FlagsOrErr & SymbolRef::SF_Exported)) + continue; + + Expected AddressOrErr = Symbol.getAddress(); + if (!AddressOrErr) { + consumeError(AddressOrErr.takeError()); + continue; + } + const uint64_t Address = *AddressOrErr; + + auto TypeOrErr = Symbol.getType(); + if (!TypeOrErr) { + consumeError(TypeOrErr.takeError()); + continue; + } + const bool IsCode = (*TypeOrErr & SymbolRef::ST_Function); + + auto *DWARFCU = IsCode ? DiCtx->getCompileUnitForCodeAddress(Address) + : DiCtx->getCompileUnitForDataAddress(Address); + if (!DWARFCU) + continue; + + const DWARFDie &DIE = IsCode ? 
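Assumed semantics of the IMULZU definitions above (the EVEX_B/ZU "zero upper" form of imul-by-immediate): for the 16-bit variants the truncated product is zero-extended into the full destination instead of being merged with the old upper bits, sketched here in plain C++ rather than taken from any documented pseudocode.

#include <cassert>
#include <cstdint>

static uint64_t imulzu16(uint64_t OldDst, uint16_t Src, uint16_t Imm) {
  const uint16_t Prod = (uint16_t)(Src * Imm); // low 16 bits of the product
  (void)OldDst;                                // legacy imul16 keeps bits 63:16
  return Prod;                                 // ZU: bits 63:16 forced to zero
}

int main() {
  assert(imulzu16(0xDEADBEEF00000000ull, 300, 300) == (uint16_t)(300 * 300));
}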
DWARFCU->getSubroutineForAddress(Address) + : DWARFCU->getVariableForAddress(Address); + const std::string File = DIE.getDeclFile( + llvm::DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath); + const uint64_t Line = DIE.getDeclLine(); + + auto NameOrErr = Symbol.getName(); + if (!NameOrErr) { + consumeError(NameOrErr.takeError()); + continue; + } + auto Name = *NameOrErr; + auto Sym = parseSymbol(Name); + + if (!File.empty() && Line != 0) + LocMap.insert({Sym.Name.str(), RecordLoc(File, Line)}); + } + + return LocMap; +} + +SymbolToSourceLocMap +DylibReader::accumulateSourceLocFromDSYM(const StringRef DSYM, + const Target &T) { + // Find sidecar file. + auto DSYMsOrErr = MachOObjectFile::findDsymObjectMembers(DSYM); + if (!DSYMsOrErr) { + consumeError(DSYMsOrErr.takeError()); + return SymbolToSourceLocMap(); + } + if (DSYMsOrErr->empty()) + return SymbolToSourceLocMap(); + + const StringRef Path = DSYMsOrErr->front(); + auto BufOrErr = MemoryBuffer::getFile(Path); + if (auto Err = BufOrErr.getError()) + return SymbolToSourceLocMap(); + + auto BinOrErr = createBinary(*BufOrErr.get()); + if (!BinOrErr) { + consumeError(BinOrErr.takeError()); + return SymbolToSourceLocMap(); + } + // Handle single arch. + if (auto *Single = dyn_cast(BinOrErr->get())) { + auto DiCtx = DWARFContext::create( + *Single, DWARFContext::ProcessDebugRelocations::Process, nullptr, "", + DWARFErrorHandler, DWARFErrorHandler); + + return accumulateLocs(*Single, DiCtx); + } + // Handle universal companion file. + if (auto *Fat = dyn_cast(BinOrErr->get())) { + auto ObjForArch = Fat->getObjectForArch(getArchitectureName(T.Arch)); + if (!ObjForArch) { + consumeError(ObjForArch.takeError()); + return SymbolToSourceLocMap(); + } + auto MachOOrErr = ObjForArch->getAsObjectFile(); + if (!MachOOrErr) { + consumeError(MachOOrErr.takeError()); + return SymbolToSourceLocMap(); + } + auto &Obj = **MachOOrErr; + auto DiCtx = DWARFContext::create( + Obj, DWARFContext::ProcessDebugRelocations::Process, nullptr, "", + DWARFErrorHandler, DWARFErrorHandler); + + return accumulateLocs(Obj, DiCtx); + } + return SymbolToSourceLocMap(); +} diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt index 034f1587ae8df..5fbdbc3a014f9 100644 --- a/llvm/lib/Transforms/IPO/CMakeLists.txt +++ b/llvm/lib/Transforms/IPO/CMakeLists.txt @@ -35,6 +35,7 @@ add_llvm_component_library(LLVMipo PartialInlining.cpp SampleContextTracker.cpp SampleProfile.cpp + SampleProfileMatcher.cpp SampleProfileProbe.cpp SCCP.cpp StripDeadPrototypes.cpp diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 7545a92c114ef..b5f45a252c7b4 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -71,6 +71,7 @@ #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/ProfiledCallGraph.h" #include "llvm/Transforms/IPO/SampleContextTracker.h" +#include "llvm/Transforms/IPO/SampleProfileMatcher.h" #include "llvm/Transforms/IPO/SampleProfileProbe.h" #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/CallPromotionUtils.h" @@ -129,16 +130,16 @@ static cl::opt SampleProfileRemappingFile( "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden); -static cl::opt SalvageStaleProfile( +cl::opt SalvageStaleProfile( "salvage-stale-profile", cl::Hidden, cl::init(false), cl::desc("Salvage stale profile by fuzzy matching and use the 
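A hypothetical driver for the dSYM reader added above, based only on the signatures in this patch; note the function swallows all errors internally and signals failure with an empty map, so callers only get a coarse success signal.

#include "llvm/Support/raw_ostream.h"
#include "llvm/TextAPI/DylibReader.h"

using namespace llvm;
using namespace llvm::MachO;

static void reportIfNoDebugInfo(StringRef DSYMPath, const Target &T) {
  SymbolToSourceLocMap Locs =
      DylibReader::accumulateSourceLocFromDSYM(DSYMPath, T);
  if (Locs.empty()) // also the result for missing or unreadable companions
    errs() << "no source locations recovered from " << DSYMPath << "\n";
}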
remapped " "location for sample profile query.")); -static cl::opt ReportProfileStaleness( +cl::opt ReportProfileStaleness( "report-profile-staleness", cl::Hidden, cl::init(false), cl::desc("Compute and report stale profile statistical metrics.")); -static cl::opt PersistProfileStaleness( +cl::opt PersistProfileStaleness( "persist-profile-staleness", cl::Hidden, cl::init(false), cl::desc("Compute stale profile statistical metrics and write it into the " "native object file(.llvm_stats section).")); @@ -448,138 +449,6 @@ using CandidateQueue = PriorityQueue, CandidateComparer>; -// Sample profile matching - fuzzy match. -class SampleProfileMatcher { - Module &M; - SampleProfileReader &Reader; - const PseudoProbeManager *ProbeManager; - const ThinOrFullLTOPhase LTOPhase; - SampleProfileMap FlattenedProfiles; - // For each function, the matcher generates a map, of which each entry is a - // mapping from the source location of current build to the source location in - // the profile. - StringMap FuncMappings; - - // Match state for an anchor/callsite. - enum class MatchState { - Unknown = 0, - // Initial match between input profile and current IR. - InitialMatch = 1, - // Initial mismatch between input profile and current IR. - InitialMismatch = 2, - // InitialMatch stays matched after fuzzy profile matching. - UnchangedMatch = 3, - // InitialMismatch stays mismatched after fuzzy profile matching. - UnchangedMismatch = 4, - // InitialMismatch is recovered after fuzzy profile matching. - RecoveredMismatch = 5, - // InitialMatch is removed and becomes mismatched after fuzzy profile - // matching. - RemovedMatch = 6, - }; - - // For each function, store every callsite and its matching state into this - // map, of which each entry is a pair of callsite location and MatchState. - // This is used for profile staleness computation and report. - StringMap> - FuncCallsiteMatchStates; - - // Profile mismatch statstics: - uint64_t TotalProfiledFunc = 0; - // Num of checksum-mismatched function. - uint64_t NumStaleProfileFunc = 0; - uint64_t TotalProfiledCallsites = 0; - uint64_t NumMismatchedCallsites = 0; - uint64_t NumRecoveredCallsites = 0; - // Total samples for all profiled functions. - uint64_t TotalFunctionSamples = 0; - // Total samples for all checksum-mismatched functions. - uint64_t MismatchedFunctionSamples = 0; - uint64_t MismatchedCallsiteSamples = 0; - uint64_t RecoveredCallsiteSamples = 0; - - // A dummy name for unknown indirect callee, used to differentiate from a - // non-call instruction that also has an empty callee name. - static constexpr const char *UnknownIndirectCallee = - "unknown.indirect.callee"; - -public: - SampleProfileMatcher(Module &M, SampleProfileReader &Reader, - const PseudoProbeManager *ProbeManager, - ThinOrFullLTOPhase LTOPhase) - : M(M), Reader(Reader), ProbeManager(ProbeManager), LTOPhase(LTOPhase){}; - void runOnModule(); - void clearMatchingData() { - // Do not clear FuncMappings, it stores IRLoc to ProfLoc remappings which - // will be used for sample loader. 
- FuncCallsiteMatchStates.clear(); - } - -private: - FunctionSamples *getFlattenedSamplesFor(const Function &F) { - StringRef CanonFName = FunctionSamples::getCanonicalFnName(F); - auto It = FlattenedProfiles.find(FunctionId(CanonFName)); - if (It != FlattenedProfiles.end()) - return &It->second; - return nullptr; - } - void runOnFunction(Function &F); - void findIRAnchors(const Function &F, - std::map &IRAnchors); - void findProfileAnchors( - const FunctionSamples &FS, - std::map> &ProfileAnchors); - // Record the callsite match states for profile staleness report, the result - // is saved in FuncCallsiteMatchStates. - void recordCallsiteMatchStates( - const Function &F, const std::map &IRAnchors, - const std::map> - &ProfileAnchors, - const LocToLocMap *IRToProfileLocationMap); - - bool isMismatchState(const enum MatchState &State) { - return State == MatchState::InitialMismatch || - State == MatchState::UnchangedMismatch || - State == MatchState::RemovedMatch; - }; - - bool isInitialState(const enum MatchState &State) { - return State == MatchState::InitialMatch || - State == MatchState::InitialMismatch; - }; - - bool isFinalState(const enum MatchState &State) { - return State == MatchState::UnchangedMatch || - State == MatchState::UnchangedMismatch || - State == MatchState::RecoveredMismatch || - State == MatchState::RemovedMatch; - }; - - // Count the samples of checksum mismatched function for the top-level - // function and all inlinees. - void countMismatchedFuncSamples(const FunctionSamples &FS, bool IsTopLevel); - // Count the number of mismatched or recovered callsites. - void countMismatchCallsites(const FunctionSamples &FS); - // Count the samples of mismatched or recovered callsites for top-level - // function and all inlinees. - void countMismatchedCallsiteSamples(const FunctionSamples &FS); - void computeAndReportProfileStaleness(); - - LocToLocMap &getIRToProfileLocationMap(const Function &F) { - auto Ret = FuncMappings.try_emplace( - FunctionSamples::getCanonicalFnName(F.getName()), LocToLocMap()); - return Ret.first->second; - } - void distributeIRToProfileLocationMap(); - void distributeIRToProfileLocationMap(FunctionSamples &FS); - void runStaleProfileMatching( - const Function &F, const std::map &IRAnchors, - const std::map> - &ProfileAnchors, - LocToLocMap &IRToProfileLocationMap); - void reportOrPersistProfileStats(); -}; - /// Sample profile pass. /// /// This pass reads profile data from the file specified by @@ -766,10 +635,6 @@ void SampleProfileLoaderBaseImpl::computeDominanceAndLoopInfo( } } // namespace llvm -static bool skipProfileForFunction(const Function &F) { - return F.isDeclaration() || !F.hasFnAttribute("use-sample-profile"); -} - ErrorOr SampleProfileLoader::getInstWeight(const Instruction &Inst) { if (FunctionSamples::ProfileIsProbeBased) return getProbeWeight(Inst); @@ -2262,535 +2127,6 @@ bool SampleProfileLoader::rejectHighStalenessProfile( return false; } -void SampleProfileMatcher::findIRAnchors( - const Function &F, std::map &IRAnchors) { - // For inlined code, recover the original callsite and callee by finding the - // top-level inline frame. e.g. For frame stack "main:1 @ foo:2 @ bar:3", the - // top-level frame is "main:1", the callsite is "1" and the callee is "foo". 
- auto FindTopLevelInlinedCallsite = [](const DILocation *DIL) { - assert((DIL && DIL->getInlinedAt()) && "No inlined callsite"); - const DILocation *PrevDIL = nullptr; - do { - PrevDIL = DIL; - DIL = DIL->getInlinedAt(); - } while (DIL->getInlinedAt()); - - LineLocation Callsite = FunctionSamples::getCallSiteIdentifier(DIL); - StringRef CalleeName = PrevDIL->getSubprogramLinkageName(); - return std::make_pair(Callsite, CalleeName); - }; - - auto GetCanonicalCalleeName = [](const CallBase *CB) { - StringRef CalleeName = UnknownIndirectCallee; - if (Function *Callee = CB->getCalledFunction()) - CalleeName = FunctionSamples::getCanonicalFnName(Callee->getName()); - return CalleeName; - }; - - // Extract profile matching anchors in the IR. - for (auto &BB : F) { - for (auto &I : BB) { - DILocation *DIL = I.getDebugLoc(); - if (!DIL) - continue; - - if (FunctionSamples::ProfileIsProbeBased) { - if (auto Probe = extractProbe(I)) { - // Flatten inlined IR for the matching. - if (DIL->getInlinedAt()) { - IRAnchors.emplace(FindTopLevelInlinedCallsite(DIL)); - } else { - // Use empty StringRef for basic block probe. - StringRef CalleeName; - if (const auto *CB = dyn_cast(&I)) { - // Skip the probe inst whose callee name is "llvm.pseudoprobe". - if (!isa(&I)) - CalleeName = GetCanonicalCalleeName(CB); - } - IRAnchors.emplace(LineLocation(Probe->Id, 0), CalleeName); - } - } - } else { - // TODO: For line-number based profile(AutoFDO), currently only support - // find callsite anchors. In future, we need to parse all the non-call - // instructions to extract the line locations for profile matching. - if (!isa(&I) || isa(&I)) - continue; - - if (DIL->getInlinedAt()) { - IRAnchors.emplace(FindTopLevelInlinedCallsite(DIL)); - } else { - LineLocation Callsite = FunctionSamples::getCallSiteIdentifier(DIL); - StringRef CalleeName = GetCanonicalCalleeName(dyn_cast(&I)); - IRAnchors.emplace(Callsite, CalleeName); - } - } - } - } -} - -void SampleProfileMatcher::findProfileAnchors( - const FunctionSamples &FS, - std::map> &ProfileAnchors) { - auto isInvalidLineOffset = [](uint32_t LineOffset) { - return LineOffset & 0x8000; - }; - - for (const auto &I : FS.getBodySamples()) { - const LineLocation &Loc = I.first; - if (isInvalidLineOffset(Loc.LineOffset)) - continue; - for (const auto &I : I.second.getCallTargets()) { - auto Ret = ProfileAnchors.try_emplace(Loc, - std::unordered_set()); - Ret.first->second.insert(I.first); - } - } - - for (const auto &I : FS.getCallsiteSamples()) { - const LineLocation &Loc = I.first; - if (isInvalidLineOffset(Loc.LineOffset)) - continue; - const auto &CalleeMap = I.second; - for (const auto &I : CalleeMap) { - auto Ret = ProfileAnchors.try_emplace(Loc, - std::unordered_set()); - Ret.first->second.insert(I.first); - } - } -} - -// Call target name anchor based profile fuzzy matching. -// Input: -// For IR locations, the anchor is the callee name of direct callsite; For -// profile locations, it's the call target name for BodySamples or inlinee's -// profile name for CallsiteSamples. -// Matching heuristic: -// First match all the anchors in lexical order, then split the non-anchor -// locations between the two anchors evenly, first half are matched based on the -// start anchor, second half are matched based on the end anchor. 
-// For example, given: -// IR locations: [1, 2(foo), 3, 5, 6(bar), 7] -// Profile locations: [1, 2, 3(foo), 4, 7, 8(bar), 9] -// The matching gives: -// [1, 2(foo), 3, 5, 6(bar), 7] -// | | | | | | -// [1, 2, 3(foo), 4, 7, 8(bar), 9] -// The output mapping: [2->3, 3->4, 5->7, 6->8, 7->9]. -void SampleProfileMatcher::runStaleProfileMatching( - const Function &F, const std::map &IRAnchors, - const std::map> - &ProfileAnchors, - LocToLocMap &IRToProfileLocationMap) { - LLVM_DEBUG(dbgs() << "Run stale profile matching for " << F.getName() - << "\n"); - assert(IRToProfileLocationMap.empty() && - "Run stale profile matching only once per function"); - - std::unordered_map> - CalleeToCallsitesMap; - for (const auto &I : ProfileAnchors) { - const auto &Loc = I.first; - const auto &Callees = I.second; - // Filter out possible indirect calls, use direct callee name as anchor. - if (Callees.size() == 1) { - FunctionId CalleeName = *Callees.begin(); - const auto &Candidates = CalleeToCallsitesMap.try_emplace( - CalleeName, std::set()); - Candidates.first->second.insert(Loc); - } - } - - auto InsertMatching = [&](const LineLocation &From, const LineLocation &To) { - // Skip the unchanged location mapping to save memory. - if (From != To) - IRToProfileLocationMap.insert({From, To}); - }; - - // Use function's beginning location as the initial anchor. - int32_t LocationDelta = 0; - SmallVector LastMatchedNonAnchors; - - for (const auto &IR : IRAnchors) { - const auto &Loc = IR.first; - auto CalleeName = IR.second; - bool IsMatchedAnchor = false; - // Match the anchor location in lexical order. - if (!CalleeName.empty()) { - auto CandidateAnchors = CalleeToCallsitesMap.find( - getRepInFormat(CalleeName)); - if (CandidateAnchors != CalleeToCallsitesMap.end() && - !CandidateAnchors->second.empty()) { - auto CI = CandidateAnchors->second.begin(); - const auto Candidate = *CI; - CandidateAnchors->second.erase(CI); - InsertMatching(Loc, Candidate); - LLVM_DEBUG(dbgs() << "Callsite with callee:" << CalleeName - << " is matched from " << Loc << " to " << Candidate - << "\n"); - LocationDelta = Candidate.LineOffset - Loc.LineOffset; - - // Match backwards for non-anchor locations. - // The locations in LastMatchedNonAnchors have been matched forwards - // based on the previous anchor, spilt it evenly and overwrite the - // second half based on the current anchor. - for (size_t I = (LastMatchedNonAnchors.size() + 1) / 2; - I < LastMatchedNonAnchors.size(); I++) { - const auto &L = LastMatchedNonAnchors[I]; - uint32_t CandidateLineOffset = L.LineOffset + LocationDelta; - LineLocation Candidate(CandidateLineOffset, L.Discriminator); - InsertMatching(L, Candidate); - LLVM_DEBUG(dbgs() << "Location is rematched backwards from " << L - << " to " << Candidate << "\n"); - } - - IsMatchedAnchor = true; - LastMatchedNonAnchors.clear(); - } - } - - // Match forwards for non-anchor locations. - if (!IsMatchedAnchor) { - uint32_t CandidateLineOffset = Loc.LineOffset + LocationDelta; - LineLocation Candidate(CandidateLineOffset, Loc.Discriminator); - InsertMatching(Loc, Candidate); - LLVM_DEBUG(dbgs() << "Location is matched from " << Loc << " to " - << Candidate << "\n"); - LastMatchedNonAnchors.emplace_back(Loc); - } - } -} - -void SampleProfileMatcher::runOnFunction(Function &F) { - // We need to use flattened function samples for matching. - // Unlike IR, which includes all callsites from the source code, the callsites - // in profile only show up when they are hit by samples, i,e. 
the profile - // callsites in one context may differ from those in another context. To get - // the maximum number of callsites, we merge the function profiles from all - // contexts, aka, the flattened profile to find profile anchors. - const auto *FSFlattened = getFlattenedSamplesFor(F); - if (!FSFlattened) - return; - - // Anchors for IR. It's a map from IR location to callee name, callee name is - // empty for non-call instruction and use a dummy name(UnknownIndirectCallee) - // for unknown indrect callee name. - std::map IRAnchors; - findIRAnchors(F, IRAnchors); - // Anchors for profile. It's a map from callsite location to a set of callee - // name. - std::map> ProfileAnchors; - findProfileAnchors(*FSFlattened, ProfileAnchors); - - // Compute the callsite match states for profile staleness report. - if (ReportProfileStaleness || PersistProfileStaleness) - recordCallsiteMatchStates(F, IRAnchors, ProfileAnchors, nullptr); - - // Run profile matching for checksum mismatched profile, currently only - // support for pseudo-probe. - if (SalvageStaleProfile && FunctionSamples::ProfileIsProbeBased && - !ProbeManager->profileIsValid(F, *FSFlattened)) { - // For imported functions, the checksum metadata(pseudo_probe_desc) are - // dropped, so we leverage function attribute(profile-checksum-mismatch) to - // transfer the info: add the attribute during pre-link phase and check it - // during post-link phase(see "profileIsValid"). - if (FunctionSamples::ProfileIsProbeBased && - LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) - F.addFnAttr("profile-checksum-mismatch"); - - // The matching result will be saved to IRToProfileLocationMap, create a - // new map for each function. - auto &IRToProfileLocationMap = getIRToProfileLocationMap(F); - runStaleProfileMatching(F, IRAnchors, ProfileAnchors, - IRToProfileLocationMap); - // Find and update callsite match states after matching. - if (ReportProfileStaleness || PersistProfileStaleness) - recordCallsiteMatchStates(F, IRAnchors, ProfileAnchors, - &IRToProfileLocationMap); - } -} - -void SampleProfileMatcher::recordCallsiteMatchStates( - const Function &F, const std::map &IRAnchors, - const std::map> - &ProfileAnchors, - const LocToLocMap *IRToProfileLocationMap) { - bool IsPostMatch = IRToProfileLocationMap != nullptr; - auto &CallsiteMatchStates = - FuncCallsiteMatchStates[FunctionSamples::getCanonicalFnName(F.getName())]; - - auto MapIRLocToProfileLoc = [&](const LineLocation &IRLoc) { - // IRToProfileLocationMap is null in pre-match phrase. - if (!IRToProfileLocationMap) - return IRLoc; - const auto &ProfileLoc = IRToProfileLocationMap->find(IRLoc); - if (ProfileLoc != IRToProfileLocationMap->end()) - return ProfileLoc->second; - else - return IRLoc; - }; - - for (const auto &I : IRAnchors) { - // After fuzzy profile matching, use the matching result to remap the - // current IR callsite. - const auto &ProfileLoc = MapIRLocToProfileLoc(I.first); - const auto &IRCalleeName = I.second; - const auto &It = ProfileAnchors.find(ProfileLoc); - if (It == ProfileAnchors.end()) - continue; - const auto &Callees = It->second; - - bool IsCallsiteMatched = false; - // Since indirect call does not have CalleeName, check conservatively if - // callsite in the profile is a callsite location. This is to reduce num of - // false positive since otherwise all the indirect call samples will be - // reported as mismatching. 
- if (IRCalleeName == SampleProfileMatcher::UnknownIndirectCallee) - IsCallsiteMatched = true; - else if (Callees.size() == 1 && Callees.count(getRepInFormat(IRCalleeName))) - IsCallsiteMatched = true; - - if (IsCallsiteMatched) { - auto It = CallsiteMatchStates.find(ProfileLoc); - if (It == CallsiteMatchStates.end()) - CallsiteMatchStates.emplace(ProfileLoc, MatchState::InitialMatch); - else if (IsPostMatch) { - if (It->second == MatchState::InitialMatch) - It->second = MatchState::UnchangedMatch; - else if (It->second == MatchState::InitialMismatch) - It->second = MatchState::RecoveredMismatch; - } - } - } - - // Check if there are any callsites in the profile that does not match to any - // IR callsites. - for (const auto &I : ProfileAnchors) { - const auto &Loc = I.first; - [[maybe_unused]] const auto &Callees = I.second; - assert(!Callees.empty() && "Callees should not be empty"); - auto It = CallsiteMatchStates.find(Loc); - if (It == CallsiteMatchStates.end()) - CallsiteMatchStates.emplace(Loc, MatchState::InitialMismatch); - else if (IsPostMatch) { - // Update the state if it's not matched(UnchangedMatch or - // RecoveredMismatch). - if (It->second == MatchState::InitialMismatch) - It->second = MatchState::UnchangedMismatch; - else if (It->second == MatchState::InitialMatch) - It->second = MatchState::RemovedMatch; - } - } -} - -void SampleProfileMatcher::countMismatchedFuncSamples(const FunctionSamples &FS, - bool IsTopLevel) { - const auto *FuncDesc = ProbeManager->getDesc(FS.getGUID()); - // Skip the function that is external or renamed. - if (!FuncDesc) - return; - - if (ProbeManager->profileIsHashMismatched(*FuncDesc, FS)) { - if (IsTopLevel) - NumStaleProfileFunc++; - // Given currently all probe ids are after block probe ids, once the - // checksum is mismatched, it's likely all the callites are mismatched and - // dropped. We conservatively count all the samples as mismatched and stop - // counting the inlinees' profiles. - MismatchedFunctionSamples += FS.getTotalSamples(); - return; - } - - // Even the current-level function checksum is matched, it's possible that the - // nested inlinees' checksums are mismatched that affect the inlinee's sample - // loading, we need to go deeper to check the inlinees' function samples. - // Similarly, count all the samples as mismatched if the inlinee's checksum is - // mismatched using this recursive function. - for (const auto &I : FS.getCallsiteSamples()) - for (const auto &CS : I.second) - countMismatchedFuncSamples(CS.second, false); -} - -void SampleProfileMatcher::countMismatchedCallsiteSamples( - const FunctionSamples &FS) { - auto It = FuncCallsiteMatchStates.find(FS.getFuncName()); - // Skip it if no mismatched callsite or this is an external function. - if (It == FuncCallsiteMatchStates.end() || It->second.empty()) - return; - const auto &CallsiteMatchStates = It->second; - - auto findMatchState = [&](const LineLocation &Loc) { - auto It = CallsiteMatchStates.find(Loc); - if (It == CallsiteMatchStates.end()) - return MatchState::Unknown; - return It->second; - }; - - auto AttributeMismatchedSamples = [&](const enum MatchState &State, - uint64_t Samples) { - if (isMismatchState(State)) - MismatchedCallsiteSamples += Samples; - else if (State == MatchState::RecoveredMismatch) - RecoveredCallsiteSamples += Samples; - }; - - // The non-inlined callsites are saved in the body samples of function - // profile, go through it to count the non-inlined callsite samples. 
- for (const auto &I : FS.getBodySamples()) - AttributeMismatchedSamples(findMatchState(I.first), I.second.getSamples()); - - // Count the inlined callsite samples. - for (const auto &I : FS.getCallsiteSamples()) { - auto State = findMatchState(I.first); - uint64_t CallsiteSamples = 0; - for (const auto &CS : I.second) - CallsiteSamples += CS.second.getTotalSamples(); - AttributeMismatchedSamples(State, CallsiteSamples); - - if (isMismatchState(State)) - continue; - - // When the current level of inlined call site matches the profiled call - // site, we need to go deeper along the inline tree to count mismatches from - // lower level inlinees. - for (const auto &CS : I.second) - countMismatchedCallsiteSamples(CS.second); - } -} - -void SampleProfileMatcher::countMismatchCallsites(const FunctionSamples &FS) { - auto It = FuncCallsiteMatchStates.find(FS.getFuncName()); - // Skip it if no mismatched callsite or this is an external function. - if (It == FuncCallsiteMatchStates.end() || It->second.empty()) - return; - const auto &MatchStates = It->second; - [[maybe_unused]] bool OnInitialState = - isInitialState(MatchStates.begin()->second); - for (const auto &I : MatchStates) { - TotalProfiledCallsites++; - assert( - (OnInitialState ? isInitialState(I.second) : isFinalState(I.second)) && - "Profile matching state is inconsistent"); - - if (isMismatchState(I.second)) - NumMismatchedCallsites++; - else if (I.second == MatchState::RecoveredMismatch) - NumRecoveredCallsites++; - } -} - -void SampleProfileMatcher::computeAndReportProfileStaleness() { - if (!ReportProfileStaleness && !PersistProfileStaleness) - return; - - // Count profile mismatches for profile staleness report. - for (const auto &F : M) { - if (skipProfileForFunction(F)) - continue; - // As the stats will be merged by linker, skip reporting the metrics for - // imported functions to avoid repeated counting. - if (GlobalValue::isAvailableExternallyLinkage(F.getLinkage())) - continue; - const auto *FS = Reader.getSamplesFor(F); - if (!FS) - continue; - TotalProfiledFunc++; - TotalFunctionSamples += FS->getTotalSamples(); - - // Checksum mismatch is only used in pseudo-probe mode. - if (FunctionSamples::ProfileIsProbeBased) - countMismatchedFuncSamples(*FS, true); - - // Count mismatches and samples for calliste. 
- countMismatchCallsites(*FS); - countMismatchedCallsiteSamples(*FS); - } - - if (ReportProfileStaleness) { - if (FunctionSamples::ProfileIsProbeBased) { - errs() << "(" << NumStaleProfileFunc << "/" << TotalProfiledFunc << ")" - << " of functions' profile are invalid and " - << " (" << MismatchedFunctionSamples << "/" << TotalFunctionSamples - << ") of samples are discarded due to function hash mismatch.\n"; - } - errs() << "(" << (NumMismatchedCallsites + NumRecoveredCallsites) << "/" - << TotalProfiledCallsites << ")" - << " of callsites' profile are invalid and " - << "(" << (MismatchedCallsiteSamples + RecoveredCallsiteSamples) - << "/" << TotalFunctionSamples << ")" - << " of samples are discarded due to callsite location mismatch.\n"; - errs() << "(" << NumRecoveredCallsites << "/" - << (NumRecoveredCallsites + NumMismatchedCallsites) << ")" - << " of callsites and " - << "(" << RecoveredCallsiteSamples << "/" - << (RecoveredCallsiteSamples + MismatchedCallsiteSamples) << ")" - << " of samples are recovered by stale profile matching.\n"; - } - - if (PersistProfileStaleness) { - LLVMContext &Ctx = M.getContext(); - MDBuilder MDB(Ctx); - - SmallVector> ProfStatsVec; - if (FunctionSamples::ProfileIsProbeBased) { - ProfStatsVec.emplace_back("NumStaleProfileFunc", NumStaleProfileFunc); - ProfStatsVec.emplace_back("TotalProfiledFunc", TotalProfiledFunc); - ProfStatsVec.emplace_back("MismatchedFunctionSamples", - MismatchedFunctionSamples); - ProfStatsVec.emplace_back("TotalFunctionSamples", TotalFunctionSamples); - } - - ProfStatsVec.emplace_back("NumMismatchedCallsites", NumMismatchedCallsites); - ProfStatsVec.emplace_back("NumRecoveredCallsites", NumRecoveredCallsites); - ProfStatsVec.emplace_back("TotalProfiledCallsites", TotalProfiledCallsites); - ProfStatsVec.emplace_back("MismatchedCallsiteSamples", - MismatchedCallsiteSamples); - ProfStatsVec.emplace_back("RecoveredCallsiteSamples", - RecoveredCallsiteSamples); - - auto *MD = MDB.createLLVMStats(ProfStatsVec); - auto *NMD = M.getOrInsertNamedMetadata("llvm.stats"); - NMD->addOperand(MD); - } -} - -void SampleProfileMatcher::runOnModule() { - ProfileConverter::flattenProfile(Reader.getProfiles(), FlattenedProfiles, - FunctionSamples::ProfileIsCS); - for (auto &F : M) { - if (skipProfileForFunction(F)) - continue; - runOnFunction(F); - } - if (SalvageStaleProfile) - distributeIRToProfileLocationMap(); - - computeAndReportProfileStaleness(); -} - -void SampleProfileMatcher::distributeIRToProfileLocationMap( - FunctionSamples &FS) { - const auto ProfileMappings = FuncMappings.find(FS.getFuncName()); - if (ProfileMappings != FuncMappings.end()) { - FS.setIRToProfileLocationMap(&(ProfileMappings->second)); - } - - for (auto &Callees : - const_cast(FS.getCallsiteSamples())) { - for (auto &FS : Callees.second) { - distributeIRToProfileLocationMap(FS.second); - } - } -} - -// Use a central place to distribute the matching results. Outlined and inlined -// profile with the function name will be set to the same pointer. 
-void SampleProfileMatcher::distributeIRToProfileLocationMap() { - for (auto &I : Reader.getProfiles()) { - distributeIRToProfileLocationMap(I.second); - } -} - bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, ProfileSummaryInfo *_PSI, LazyCallGraph &CG) { diff --git a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp new file mode 100644 index 0000000000000..bb46539989ab5 --- /dev/null +++ b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp @@ -0,0 +1,552 @@ +//===- SampleProfileMatcher.cpp - Sampling-based Stale Profile Matcher ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the SampleProfileMatcher used for stale +// profile matching. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/SampleProfileMatcher.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" + +using namespace llvm; +using namespace sampleprof; + +#define DEBUG_TYPE "sample-profile-matcher" + +extern cl::opt SalvageStaleProfile; +extern cl::opt PersistProfileStaleness; +extern cl::opt ReportProfileStaleness; + +void SampleProfileMatcher::findIRAnchors( + const Function &F, std::map &IRAnchors) { + // For inlined code, recover the original callsite and callee by finding the + // top-level inline frame. e.g. For frame stack "main:1 @ foo:2 @ bar:3", the + // top-level frame is "main:1", the callsite is "1" and the callee is "foo". + auto FindTopLevelInlinedCallsite = [](const DILocation *DIL) { + assert((DIL && DIL->getInlinedAt()) && "No inlined callsite"); + const DILocation *PrevDIL = nullptr; + do { + PrevDIL = DIL; + DIL = DIL->getInlinedAt(); + } while (DIL->getInlinedAt()); + + LineLocation Callsite = FunctionSamples::getCallSiteIdentifier(DIL); + StringRef CalleeName = PrevDIL->getSubprogramLinkageName(); + return std::make_pair(Callsite, CalleeName); + }; + + auto GetCanonicalCalleeName = [](const CallBase *CB) { + StringRef CalleeName = UnknownIndirectCallee; + if (Function *Callee = CB->getCalledFunction()) + CalleeName = FunctionSamples::getCanonicalFnName(Callee->getName()); + return CalleeName; + }; + + // Extract profile matching anchors in the IR. + for (auto &BB : F) { + for (auto &I : BB) { + DILocation *DIL = I.getDebugLoc(); + if (!DIL) + continue; + + if (FunctionSamples::ProfileIsProbeBased) { + if (auto Probe = extractProbe(I)) { + // Flatten inlined IR for the matching. + if (DIL->getInlinedAt()) { + IRAnchors.emplace(FindTopLevelInlinedCallsite(DIL)); + } else { + // Use empty StringRef for basic block probe. + StringRef CalleeName; + if (const auto *CB = dyn_cast(&I)) { + // Skip the probe inst whose callee name is "llvm.pseudoprobe". + if (!isa(&I)) + CalleeName = GetCanonicalCalleeName(CB); + } + IRAnchors.emplace(LineLocation(Probe->Id, 0), CalleeName); + } + } + } else { + // TODO: For line-number based profile(AutoFDO), currently only support + // find callsite anchors. In future, we need to parse all the non-call + // instructions to extract the line locations for profile matching. 
+void SampleProfileMatcher::findProfileAnchors(
+    const FunctionSamples &FS,
+    std::map<LineLocation, std::unordered_set<FunctionId>> &ProfileAnchors) {
+  auto isInvalidLineOffset = [](uint32_t LineOffset) {
+    return LineOffset & 0x8000;
+  };
+
+  for (const auto &I : FS.getBodySamples()) {
+    const LineLocation &Loc = I.first;
+    if (isInvalidLineOffset(Loc.LineOffset))
+      continue;
+    for (const auto &I : I.second.getCallTargets()) {
+      auto Ret =
+          ProfileAnchors.try_emplace(Loc, std::unordered_set<FunctionId>());
+      Ret.first->second.insert(I.first);
+    }
+  }
+
+  for (const auto &I : FS.getCallsiteSamples()) {
+    const LineLocation &Loc = I.first;
+    if (isInvalidLineOffset(Loc.LineOffset))
+      continue;
+    const auto &CalleeMap = I.second;
+    for (const auto &I : CalleeMap) {
+      auto Ret =
+          ProfileAnchors.try_emplace(Loc, std::unordered_set<FunctionId>());
+      Ret.first->second.insert(I.first);
+    }
+  }
+}
+
+// Call target name anchor based profile fuzzy matching.
+// Input:
+// For IR locations, the anchor is the callee name of direct callsite; For
+// profile locations, it's the call target name for BodySamples or inlinee's
+// profile name for CallsiteSamples.
+// Matching heuristic:
+// First match all the anchors in lexical order, then split the non-anchor
+// locations between the two anchors evenly, first half are matched based on
+// the start anchor, second half are matched based on the end anchor.
+// For example, given:
+// IR locations:      [1, 2(foo), 3, 5, 6(bar), 7]
+// Profile locations: [1, 2, 3(foo), 4, 7, 8(bar), 9]
+// The matching gives:
+//   [1,    2(foo), 3,  5,  6(bar), 7]
+//    |     |       |   |   |       |
+//   [1, 2, 3(foo), 4,  7,  8(bar), 9]
+// The output mapping: [2->3, 3->4, 5->7, 6->8, 7->9].
+void SampleProfileMatcher::runStaleProfileMatching(
+    const Function &F, const std::map<LineLocation, StringRef> &IRAnchors,
+    const std::map<LineLocation, std::unordered_set<FunctionId>>
+        &ProfileAnchors,
+    LocToLocMap &IRToProfileLocationMap) {
+  LLVM_DEBUG(dbgs() << "Run stale profile matching for " << F.getName()
+                    << "\n");
+  assert(IRToProfileLocationMap.empty() &&
+         "Run stale profile matching only once per function");
+
+  std::unordered_map<FunctionId, std::set<LineLocation>> CalleeToCallsitesMap;
+  for (const auto &I : ProfileAnchors) {
+    const auto &Loc = I.first;
+    const auto &Callees = I.second;
+    // Filter out possible indirect calls, use direct callee name as anchor.
+    if (Callees.size() == 1) {
+      FunctionId CalleeName = *Callees.begin();
+      const auto &Candidates = CalleeToCallsitesMap.try_emplace(
+          CalleeName, std::set<LineLocation>());
+      Candidates.first->second.insert(Loc);
+    }
+  }
+
+  auto InsertMatching = [&](const LineLocation &From, const LineLocation &To) {
+    // Skip the unchanged location mapping to save memory.
+    if (From != To)
+      IRToProfileLocationMap.insert({From, To});
+  };
+
+  // Use function's beginning location as the initial anchor.
+  int32_t LocationDelta = 0;
+  SmallVector<LineLocation> LastMatchedNonAnchors;
+
+  for (const auto &IR : IRAnchors) {
+    const auto &Loc = IR.first;
+    auto CalleeName = IR.second;
+    bool IsMatchedAnchor = false;
+    // Match the anchor location in lexical order.
+    if (!CalleeName.empty()) {
+      auto CandidateAnchors =
+          CalleeToCallsitesMap.find(getRepInFormat(CalleeName));
+      if (CandidateAnchors != CalleeToCallsitesMap.end() &&
+          !CandidateAnchors->second.empty()) {
+        auto CI = CandidateAnchors->second.begin();
+        const auto Candidate = *CI;
+        CandidateAnchors->second.erase(CI);
+        InsertMatching(Loc, Candidate);
+        LLVM_DEBUG(dbgs() << "Callsite with callee:" << CalleeName
+                          << " is matched from " << Loc << " to " << Candidate
+                          << "\n");
+        LocationDelta = Candidate.LineOffset - Loc.LineOffset;
+
+        // Match backwards for non-anchor locations.
+        // The locations in LastMatchedNonAnchors have been matched forwards
+        // based on the previous anchor; split it evenly and overwrite the
+        // second half based on the current anchor.
+        for (size_t I = (LastMatchedNonAnchors.size() + 1) / 2;
+             I < LastMatchedNonAnchors.size(); I++) {
+          const auto &L = LastMatchedNonAnchors[I];
+          uint32_t CandidateLineOffset = L.LineOffset + LocationDelta;
+          LineLocation Candidate(CandidateLineOffset, L.Discriminator);
+          InsertMatching(L, Candidate);
+          LLVM_DEBUG(dbgs() << "Location is rematched backwards from " << L
+                            << " to " << Candidate << "\n");
+        }
+
+        IsMatchedAnchor = true;
+        LastMatchedNonAnchors.clear();
+      }
+    }
+
+    // Match forwards for non-anchor locations.
+    if (!IsMatchedAnchor) {
+      uint32_t CandidateLineOffset = Loc.LineOffset + LocationDelta;
+      LineLocation Candidate(CandidateLineOffset, Loc.Discriminator);
+      InsertMatching(Loc, Candidate);
+      LLVM_DEBUG(dbgs() << "Location is matched from " << Loc << " to "
+                        << Candidate << "\n");
+      LastMatchedNonAnchors.emplace_back(Loc);
+    }
+  }
+}
+
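// Illustration (editorial sketch, not part of the patch): a standalone toy run
// of the even-split heuristic above, on the example from the comment before
// runStaleProfileMatching. Locations are plain integers instead of
// LineLocations, and the mapping is overwritten directly on a rematch.
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  // IR locations: [1, 2(foo), 3, 5, 6(bar), 7]
  std::vector<std::pair<unsigned, std::string>> IRAnchors = {
      {1, ""}, {2, "foo"}, {3, ""}, {5, ""}, {6, "bar"}, {7, ""}};
  // Profile locations: 3 is a callsite of foo, 8 is a callsite of bar.
  std::map<std::string, std::set<unsigned>> CalleeToCallsites = {
      {"foo", {3}}, {"bar", {8}}};

  std::map<unsigned, unsigned> Mapping;
  int Delta = 0;
  std::vector<unsigned> LastNonAnchors;

  for (const auto &[Loc, Callee] : IRAnchors) {
    auto It = CalleeToCallsites.find(Callee);
    if (!Callee.empty() && It != CalleeToCallsites.end() &&
        !It->second.empty()) {
      // Anchor: consume the next candidate callsite in lexical order.
      unsigned Candidate = *It->second.begin();
      It->second.erase(It->second.begin());
      Mapping[Loc] = Candidate;
      Delta = int(Candidate) - int(Loc);
      // Rematch the second half of the preceding non-anchor run backwards,
      // using the new anchor's delta.
      for (size_t I = (LastNonAnchors.size() + 1) / 2;
           I < LastNonAnchors.size(); I++)
        Mapping[LastNonAnchors[I]] = LastNonAnchors[I] + Delta;
      LastNonAnchors.clear();
    } else {
      // Non-anchor: match forwards with the last anchor's delta.
      if (Delta != 0)
        Mapping[Loc] = Loc + Delta;
      LastNonAnchors.push_back(Loc);
    }
  }

  for (const auto &[From, To] : Mapping)
    std::cout << From << " -> " << To << "\n";
  // Prints 2 -> 3, 3 -> 4, 5 -> 7, 6 -> 8, 7 -> 9, matching the comment.
}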
+void SampleProfileMatcher::runOnFunction(Function &F) {
+  // We need to use flattened function samples for matching.
+  // Unlike IR, which includes all callsites from the source code, the
+  // callsites in profile only show up when they are hit by samples, i.e. the
+  // profile callsites in one context may differ from those in another context.
+  // To get the maximum number of callsites, we merge the function profiles
+  // from all contexts, aka, the flattened profile, to find profile anchors.
+  const auto *FSFlattened = getFlattenedSamplesFor(F);
+  if (!FSFlattened)
+    return;
+
+  // Anchors for IR. It's a map from IR location to callee name; the callee
+  // name is empty for a non-call instruction, and a dummy name
+  // (UnknownIndirectCallee) is used for an unknown indirect callee name.
+  std::map<LineLocation, StringRef> IRAnchors;
+  findIRAnchors(F, IRAnchors);
+  // Anchors for profile. It's a map from callsite location to a set of callee
+  // names.
+  std::map<LineLocation, std::unordered_set<FunctionId>> ProfileAnchors;
+  findProfileAnchors(*FSFlattened, ProfileAnchors);
+
+  // Compute the callsite match states for profile staleness report.
+  if (ReportProfileStaleness || PersistProfileStaleness)
+    recordCallsiteMatchStates(F, IRAnchors, ProfileAnchors, nullptr);
+
+  // Run profile matching for checksum mismatched profile, currently only
+  // supported for pseudo-probe.
+  if (SalvageStaleProfile && FunctionSamples::ProfileIsProbeBased &&
+      !ProbeManager->profileIsValid(F, *FSFlattened)) {
+    // For imported functions, the checksum metadata (pseudo_probe_desc) is
+    // dropped, so we leverage a function attribute (profile-checksum-mismatch)
+    // to transfer the info: add the attribute during the pre-link phase and
+    // check it during the post-link phase (see "profileIsValid").
+    if (FunctionSamples::ProfileIsProbeBased &&
+        LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink)
+      F.addFnAttr("profile-checksum-mismatch");
+
+    // The matching result will be saved to IRToProfileLocationMap; create a
+    // new map for each function.
+    auto &IRToProfileLocationMap = getIRToProfileLocationMap(F);
+    runStaleProfileMatching(F, IRAnchors, ProfileAnchors,
+                            IRToProfileLocationMap);
+    // Find and update callsite match states after matching.
+    if (ReportProfileStaleness || PersistProfileStaleness)
+      recordCallsiteMatchStates(F, IRAnchors, ProfileAnchors,
+                                &IRToProfileLocationMap);
+  }
+}
+
+void SampleProfileMatcher::recordCallsiteMatchStates(
+    const Function &F, const std::map<LineLocation, StringRef> &IRAnchors,
+    const std::map<LineLocation, std::unordered_set<FunctionId>>
+        &ProfileAnchors,
+    const LocToLocMap *IRToProfileLocationMap) {
+  bool IsPostMatch = IRToProfileLocationMap != nullptr;
+  auto &CallsiteMatchStates =
+      FuncCallsiteMatchStates[FunctionSamples::getCanonicalFnName(F.getName())];
+
+  auto MapIRLocToProfileLoc = [&](const LineLocation &IRLoc) {
+    // IRToProfileLocationMap is null in the pre-match phase.
+    if (!IRToProfileLocationMap)
+      return IRLoc;
+    const auto &ProfileLoc = IRToProfileLocationMap->find(IRLoc);
+    if (ProfileLoc != IRToProfileLocationMap->end())
+      return ProfileLoc->second;
+    else
+      return IRLoc;
+  };
+
+  for (const auto &I : IRAnchors) {
+    // After fuzzy profile matching, use the matching result to remap the
+    // current IR callsite.
+    const auto &ProfileLoc = MapIRLocToProfileLoc(I.first);
+    const auto &IRCalleeName = I.second;
+    const auto &It = ProfileAnchors.find(ProfileLoc);
+    if (It == ProfileAnchors.end())
+      continue;
+    const auto &Callees = It->second;
+
+    bool IsCallsiteMatched = false;
+    // Since an indirect call does not have a CalleeName, conservatively check
+    // only whether the profile location is a callsite location. This reduces
+    // the number of false positives; otherwise all the indirect call samples
+    // would be reported as mismatched.
+    if (IRCalleeName == SampleProfileMatcher::UnknownIndirectCallee)
+      IsCallsiteMatched = true;
+    else if (Callees.size() == 1 && Callees.count(getRepInFormat(IRCalleeName)))
+      IsCallsiteMatched = true;
+
+    if (IsCallsiteMatched) {
+      auto It = CallsiteMatchStates.find(ProfileLoc);
+      if (It == CallsiteMatchStates.end())
+        CallsiteMatchStates.emplace(ProfileLoc, MatchState::InitialMatch);
+      else if (IsPostMatch) {
+        if (It->second == MatchState::InitialMatch)
+          It->second = MatchState::UnchangedMatch;
+        else if (It->second == MatchState::InitialMismatch)
+          It->second = MatchState::RecoveredMismatch;
+      }
+    }
+  }
+
+  // Check if there are any callsites in the profile that do not match any IR
+  // callsite.
+  for (const auto &I : ProfileAnchors) {
+    const auto &Loc = I.first;
+    [[maybe_unused]] const auto &Callees = I.second;
+    assert(!Callees.empty() && "Callees should not be empty");
+    auto It = CallsiteMatchStates.find(Loc);
+    if (It == CallsiteMatchStates.end())
+      CallsiteMatchStates.emplace(Loc, MatchState::InitialMismatch);
+    else if (IsPostMatch) {
+      // Update the state if it's not matched (UnchangedMismatch or
+      // RemovedMatch).
+      if (It->second == MatchState::InitialMismatch)
+        It->second = MatchState::UnchangedMismatch;
+      else if (It->second == MatchState::InitialMatch)
+        It->second = MatchState::RemovedMatch;
+    }
+  }
+}
+
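// Illustration (editorial sketch, not part of the patch): the pre/post-match
// state transitions implemented above, restated as a single pure function.
// State mirrors the MatchState enumerators used in the code; postMatch is a
// hypothetical helper that simplifies away the map bookkeeping.
#include <cassert>

enum class State {
  Unknown,
  InitialMatch,      // pre-match pass: IR and profile agree at this callsite
  InitialMismatch,   // pre-match pass: they disagree
  UnchangedMatch,    // post-match pass: still matched after remapping
  UnchangedMismatch, // post-match pass: still mismatched
  RecoveredMismatch, // post-match pass: recovered by stale profile matching
  RemovedMatch       // post-match pass: match lost after remapping
};

State postMatch(State Pre, bool MatchedNow) {
  if (MatchedNow)
    return Pre == State::InitialMatch ? State::UnchangedMatch
                                      : State::RecoveredMismatch;
  return Pre == State::InitialMismatch ? State::UnchangedMismatch
                                       : State::RemovedMatch;
}

int main() {
  assert(postMatch(State::InitialMismatch, true) == State::RecoveredMismatch);
  assert(postMatch(State::InitialMatch, false) == State::RemovedMatch);
  return 0;
}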
+void SampleProfileMatcher::countMismatchedFuncSamples(const FunctionSamples &FS,
+                                                      bool IsTopLevel) {
+  const auto *FuncDesc = ProbeManager->getDesc(FS.getGUID());
+  // Skip the function that is external or renamed.
+  if (!FuncDesc)
+    return;
+
+  if (ProbeManager->profileIsHashMismatched(*FuncDesc, FS)) {
+    if (IsTopLevel)
+      NumStaleProfileFunc++;
+    // Given that currently all probe ids come after block probe ids, once the
+    // checksum is mismatched, it's likely that all the callsites are
+    // mismatched and dropped. We conservatively count all the samples as
+    // mismatched and stop counting the inlinees' profiles.
+    MismatchedFunctionSamples += FS.getTotalSamples();
+    return;
+  }
+
+  // Even if the current-level function checksum is matched, it's possible that
+  // the nested inlinees' checksums are mismatched, which affects the inlinees'
+  // sample loading, so we need to go deeper to check the inlinees' function
+  // samples. Similarly, count all the samples as mismatched if an inlinee's
+  // checksum is mismatched, using this recursive function.
+  for (const auto &I : FS.getCallsiteSamples())
+    for (const auto &CS : I.second)
+      countMismatchedFuncSamples(CS.second, false);
+}
+
+void SampleProfileMatcher::countMismatchedCallsiteSamples(
+    const FunctionSamples &FS) {
+  auto It = FuncCallsiteMatchStates.find(FS.getFuncName());
+  // Skip it if no mismatched callsite or this is an external function.
+  if (It == FuncCallsiteMatchStates.end() || It->second.empty())
+    return;
+  const auto &CallsiteMatchStates = It->second;
+
+  auto findMatchState = [&](const LineLocation &Loc) {
+    auto It = CallsiteMatchStates.find(Loc);
+    if (It == CallsiteMatchStates.end())
+      return MatchState::Unknown;
+    return It->second;
+  };
+
+  auto AttributeMismatchedSamples = [&](const enum MatchState &State,
+                                        uint64_t Samples) {
+    if (isMismatchState(State))
+      MismatchedCallsiteSamples += Samples;
+    else if (State == MatchState::RecoveredMismatch)
+      RecoveredCallsiteSamples += Samples;
+  };
+
+  // The non-inlined callsites are saved in the body samples of the function
+  // profile; go through them to count the non-inlined callsite samples.
+  for (const auto &I : FS.getBodySamples())
+    AttributeMismatchedSamples(findMatchState(I.first), I.second.getSamples());
+
+  // Count the inlined callsite samples.
+  for (const auto &I : FS.getCallsiteSamples()) {
+    auto State = findMatchState(I.first);
+    uint64_t CallsiteSamples = 0;
+    for (const auto &CS : I.second)
+      CallsiteSamples += CS.second.getTotalSamples();
+    AttributeMismatchedSamples(State, CallsiteSamples);
+
+    if (isMismatchState(State))
+      continue;
+
+    // When the current level of inlined call site matches the profiled call
+    // site, we need to go deeper along the inline tree to count mismatches
+    // from lower-level inlinees.
+    for (const auto &CS : I.second)
+      countMismatchedCallsiteSamples(CS.second);
+  }
+}
+
+void SampleProfileMatcher::countMismatchCallsites(const FunctionSamples &FS) {
+  auto It = FuncCallsiteMatchStates.find(FS.getFuncName());
+  // Skip it if no mismatched callsite or this is an external function.
+  if (It == FuncCallsiteMatchStates.end() || It->second.empty())
+    return;
+  const auto &MatchStates = It->second;
+  [[maybe_unused]] bool OnInitialState =
+      isInitialState(MatchStates.begin()->second);
+  for (const auto &I : MatchStates) {
+    TotalProfiledCallsites++;
+    assert(
+        (OnInitialState ? isInitialState(I.second) : isFinalState(I.second)) &&
+        "Profile matching state is inconsistent");
+
+    if (isMismatchState(I.second))
+      NumMismatchedCallsites++;
+    else if (I.second == MatchState::RecoveredMismatch)
+      NumRecoveredCallsites++;
+  }
+}
+
+void SampleProfileMatcher::computeAndReportProfileStaleness() {
+  if (!ReportProfileStaleness && !PersistProfileStaleness)
+    return;
+
+  // Count profile mismatches for profile staleness report.
+  for (const auto &F : M) {
+    if (skipProfileForFunction(F))
+      continue;
+    // As the stats will be merged by the linker, skip reporting the metrics
+    // for imported functions to avoid repeated counting.
+    if (GlobalValue::isAvailableExternallyLinkage(F.getLinkage()))
+      continue;
+    const auto *FS = Reader.getSamplesFor(F);
+    if (!FS)
+      continue;
+    TotalProfiledFunc++;
+    TotalFunctionSamples += FS->getTotalSamples();
+
+    // Checksum mismatch is only used in pseudo-probe mode.
+    if (FunctionSamples::ProfileIsProbeBased)
+      countMismatchedFuncSamples(*FS, true);
+
+    // Count mismatches and samples for callsites.
+    countMismatchCallsites(*FS);
+    countMismatchedCallsiteSamples(*FS);
+  }
+
+  if (ReportProfileStaleness) {
+    if (FunctionSamples::ProfileIsProbeBased) {
+      errs() << "(" << NumStaleProfileFunc << "/" << TotalProfiledFunc
+             << ") of functions' profile are invalid and ("
+             << MismatchedFunctionSamples << "/" << TotalFunctionSamples
+             << ") of samples are discarded due to function hash mismatch.\n";
+    }
+    errs() << "(" << (NumMismatchedCallsites + NumRecoveredCallsites) << "/"
+           << TotalProfiledCallsites
+           << ") of callsites' profile are invalid and ("
+           << (MismatchedCallsiteSamples + RecoveredCallsiteSamples) << "/"
+           << TotalFunctionSamples
+           << ") of samples are discarded due to callsite location mismatch.\n";
+    errs() << "(" << NumRecoveredCallsites << "/"
+           << (NumRecoveredCallsites + NumMismatchedCallsites)
+           << ") of callsites and (" << RecoveredCallsiteSamples << "/"
+           << (RecoveredCallsiteSamples + MismatchedCallsiteSamples)
+           << ") of samples are recovered by stale profile matching.\n";
+  }
+
+  if (PersistProfileStaleness) {
+    LLVMContext &Ctx = M.getContext();
+    MDBuilder MDB(Ctx);
+
+    SmallVector<std::pair<StringRef, uint64_t>> ProfStatsVec;
+    if (FunctionSamples::ProfileIsProbeBased) {
+      ProfStatsVec.emplace_back("NumStaleProfileFunc", NumStaleProfileFunc);
+      ProfStatsVec.emplace_back("TotalProfiledFunc", TotalProfiledFunc);
+      ProfStatsVec.emplace_back("MismatchedFunctionSamples",
+                                MismatchedFunctionSamples);
+      ProfStatsVec.emplace_back("TotalFunctionSamples", TotalFunctionSamples);
+    }
+
+    ProfStatsVec.emplace_back("NumMismatchedCallsites", NumMismatchedCallsites);
+    ProfStatsVec.emplace_back("NumRecoveredCallsites", NumRecoveredCallsites);
+    ProfStatsVec.emplace_back("TotalProfiledCallsites", TotalProfiledCallsites);
+    ProfStatsVec.emplace_back("MismatchedCallsiteSamples",
+                              MismatchedCallsiteSamples);
+    ProfStatsVec.emplace_back("RecoveredCallsiteSamples",
+                              RecoveredCallsiteSamples);
+
+    auto *MD = MDB.createLLVMStats(ProfStatsVec);
+    auto *NMD = M.getOrInsertNamedMetadata("llvm.stats");
+    NMD->addOperand(MD);
+  }
+}
+
+void SampleProfileMatcher::runOnModule() {
+  ProfileConverter::flattenProfile(Reader.getProfiles(), FlattenedProfiles,
+                                   FunctionSamples::ProfileIsCS);
+  for (auto &F : M) {
+    if (skipProfileForFunction(F))
+      continue;
+    runOnFunction(F);
+  }
+  if (SalvageStaleProfile)
+    distributeIRToProfileLocationMap();
+
+  computeAndReportProfileStaleness();
+}
+
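// Illustration (editorial sketch, not part of the patch): how the report in
// computeAndReportProfileStaleness combines the counters. Recovered callsites
// still count toward the "invalid" total, and the recovery line is reported
// against that same total. The numbers below are made up.
#include <cstdint>
#include <iostream>

int main() {
  uint64_t NumMismatchedCallsites = 30, NumRecoveredCallsites = 20,
           TotalProfiledCallsites = 100;
  uint64_t Invalid = NumMismatchedCallsites + NumRecoveredCallsites;
  std::cout << "(" << Invalid << "/" << TotalProfiledCallsites
            << ") of callsites' profile are invalid\n"; // (50/100)
  std::cout << "(" << NumRecoveredCallsites << "/" << Invalid
            << ") of callsites are recovered\n"; // (20/50)
}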
+void SampleProfileMatcher::distributeIRToProfileLocationMap(
+    FunctionSamples &FS) {
+  const auto ProfileMappings = FuncMappings.find(FS.getFuncName());
+  if (ProfileMappings != FuncMappings.end()) {
+    FS.setIRToProfileLocationMap(&(ProfileMappings->second));
+  }
+
+  for (auto &Callees :
+       const_cast<CallsiteSampleMap &>(FS.getCallsiteSamples())) {
+    for (auto &FS : Callees.second) {
+      distributeIRToProfileLocationMap(FS.second);
+    }
+  }
+}
+
+// Use a central place to distribute the matching results. Outlined and inlined
+// profiles with the same function name will be set to the same pointer.
+void SampleProfileMatcher::distributeIRToProfileLocationMap() {
+  for (auto &I : Reader.getProfiles()) {
+    distributeIRToProfileLocationMap(I.second);
+  }
+}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 089a70c6e6cca..0652a8ba80b3f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -734,19 +734,18 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) {
 
   if (DestWidth == 1) {
     Value *Zero = Constant::getNullValue(SrcTy);
-    if (DestTy->isIntegerTy()) {
-      // Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only).
-      // TODO: We canonicalize to more instructions here because we are probably
-      // lacking equivalent analysis for trunc relative to icmp. There may also
-      // be codegen concerns. If those trunc limitations were removed, we could
-      // remove this transform.
-      Value *And = Builder.CreateAnd(Src, ConstantInt::get(SrcTy, 1));
-      return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
-    }
-
-    // For vectors, we do not canonicalize all truncs to icmp, so optimize
-    // patterns that would be covered within visitICmpInst.
     Value *X;
+    const APInt *C1;
+    Constant *C2;
+    if (match(Src, m_OneUse(m_Shr(m_Shl(m_Power2(C1), m_Value(X)),
+                                  m_ImmConstant(C2))))) {
+      // trunc ((C1 << X) >> C2) to i1 --> X == (C2-cttz(C1)), where C1 is pow2
+      Constant *Log2C1 = ConstantInt::get(SrcTy, C1->exactLogBase2());
+      Constant *CmpC = ConstantExpr::getSub(C2, Log2C1);
+      return new ICmpInst(ICmpInst::ICMP_EQ, X, CmpC);
+    }
+
     Constant *C;
     if (match(Src, m_OneUse(m_LShr(m_Value(X), m_Constant(C))))) {
       // trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0
@@ -763,6 +762,14 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) {
       Value *And = Builder.CreateAnd(X, Builder.CreateOr(MaskC, One));
       return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
     }
+
+    {
+      const APInt *C;
+      if (match(Src, m_Shl(m_APInt(C), m_Value(X))) && (*C)[0] == 1) {
+        // trunc (C << X) to i1 --> X == 0, where C is odd
+        return new ICmpInst(ICmpInst::Predicate::ICMP_EQ, X, Zero);
+      }
+    }
   }
 
   Value *A, *B;
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 3a3bcde7c3dcd..74cffbc005c82 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -59,6 +59,10 @@ PoisonFlags::PoisonFlags(const Instruction *I) {
     Disjoint = PDI->isDisjoint();
   if (auto *PNI = dyn_cast<PossiblyNonNegInst>(I))
     NNeg = PNI->hasNonNeg();
+  if (auto *TI = dyn_cast<TruncInst>(I)) {
+    NUW = TI->hasNoUnsignedWrap();
+    NSW = TI->hasNoSignedWrap();
+  }
 }
 
 void PoisonFlags::apply(Instruction *I) {
@@ -72,6 +76,10 @@ void PoisonFlags::apply(Instruction *I) {
     PDI->setIsDisjoint(Disjoint);
   if (auto *PNI = dyn_cast<PossiblyNonNegInst>(I))
     PNI->setNonNeg(NNeg);
+  if (isa<TruncInst>(I)) {
+    I->setHasNoUnsignedWrap(NUW);
+    I->setHasNoSignedWrap(NSW);
+  }
 }
 
 /// ReuseOrCreateCast - Arrange for there to be a cast of V to Ty at IP,
diff --git a/llvm/test/Assembler/flags.ll
b/llvm/test/Assembler/flags.ll index 475e8d483441b..93cbbfd0461a0 100644 --- a/llvm/test/Assembler/flags.ll +++ b/llvm/test/Assembler/flags.ll @@ -261,3 +261,51 @@ define i64 @test_or(i64 %a, i64 %b) { %res = or disjoint i64 %a, %b ret i64 %res } + +define i32 @test_trunc_signed(i64 %a) { +; CHECK: %res = trunc nsw i64 %a to i32 + %res = trunc nsw i64 %a to i32 + ret i32 %res +} + +define i32 @test_trunc_unsigned(i64 %a) { +; CHECK: %res = trunc nuw i64 %a to i32 + %res = trunc nuw i64 %a to i32 + ret i32 %res +} + +define i32 @test_trunc_both(i64 %a) { +; CHECK: %res = trunc nuw nsw i64 %a to i32 + %res = trunc nuw nsw i64 %a to i32 + ret i32 %res +} + +define i32 @test_trunc_both_reversed(i64 %a) { +; CHECK: %res = trunc nuw nsw i64 %a to i32 + %res = trunc nsw nuw i64 %a to i32 + ret i32 %res +} + +define <2 x i32> @test_trunc_signed_vector(<2 x i64> %a) { +; CHECK: %res = trunc nsw <2 x i64> %a to <2 x i32> + %res = trunc nsw <2 x i64> %a to <2 x i32> + ret <2 x i32> %res +} + +define <2 x i32> @test_trunc_unsigned_vector(<2 x i64> %a) { +; CHECK: %res = trunc nuw <2 x i64> %a to <2 x i32> + %res = trunc nuw <2 x i64> %a to <2 x i32> + ret <2 x i32> %res +} + +define <2 x i32> @test_trunc_both_vector(<2 x i64> %a) { +; CHECK: %res = trunc nuw nsw <2 x i64> %a to <2 x i32> + %res = trunc nuw nsw <2 x i64> %a to <2 x i32> + ret <2 x i32> %res +} + +define <2 x i32> @test_trunc_both_reversed_vector(<2 x i64> %a) { +; CHECK: %res = trunc nuw nsw <2 x i64> %a to <2 x i32> + %res = trunc nsw nuw <2 x i64> %a to <2 x i32> + ret <2 x i32> %res +} diff --git a/llvm/test/Bitcode/flags.ll b/llvm/test/Bitcode/flags.ll index e3fc827d865d7..96995ec570c93 100644 --- a/llvm/test/Bitcode/flags.ll +++ b/llvm/test/Bitcode/flags.ll @@ -20,17 +20,34 @@ second: ; preds = %first %ll = zext i32 %s to i64 %jj = or disjoint i32 %a, 0 %oo = or i32 %a, 0 + %tu = trunc nuw i32 %a to i16 + %ts = trunc nsw i32 %a to i16 + %tus = trunc nuw nsw i32 %a to i16 + %t = trunc i32 %a to i16 + %tuv = trunc nuw <2 x i32> %aa to <2 x i16> + %tsv = trunc nsw <2 x i32> %aa to <2 x i16> + %tusv = trunc nuw nsw <2 x i32> %aa to <2 x i16> + %tv = trunc <2 x i32> %aa to <2 x i16> unreachable -first: ; preds = %entry - %a = bitcast i32 0 to i32 ; [#uses=8] - %uu = add nuw i32 %a, 0 ; [#uses=0] - %ss = add nsw i32 %a, 0 ; [#uses=0] - %uuss = add nuw nsw i32 %a, 0 ; [#uses=0] - %zz = add i32 %a, 0 ; [#uses=0] +first: ; preds = %entry + %aa = bitcast <2 x i32> to <2 x i32> + %a = bitcast i32 0 to i32 ; [#uses=8] + %uu = add nuw i32 %a, 0 ; [#uses=0] + %ss = add nsw i32 %a, 0 ; [#uses=0] + %uuss = add nuw nsw i32 %a, 0 ; [#uses=0] + %zz = add i32 %a, 0 ; [#uses=0] %kk = zext nneg i32 %a to i64 %rr = zext i32 %ss to i64 %mm = or disjoint i32 %a, 0 %nn = or i32 %a, 0 + %tuu = trunc nuw i32 %a to i16 + %tss = trunc nsw i32 %a to i16 + %tuss = trunc nuw nsw i32 %a to i16 + %tt = trunc i32 %a to i16 + %ttuv = trunc nuw <2 x i32> %aa to <2 x i16> + %ttsv = trunc nsw <2 x i32> %aa to <2 x i16> + %ttusv = trunc nuw nsw <2 x i32> %aa to <2 x i16> + %ttv = trunc <2 x i32> %aa to <2 x i16> br label %second } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll index 458c2cb76d9e3..7163da0dc0243 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll @@ -512,9 +512,9 @@ define i32 @fetch_and_nand(ptr %p) #0 { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB6_2 ; CHECK-NOLSE-O0-NEXT: LBB6_4: ; %atomicrmw.start ; 
CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB6_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB6_1 ; CHECK-NOLSE-O0-NEXT: b LBB6_5 @@ -540,9 +540,9 @@ define i32 @fetch_and_nand(ptr %p) #0 { ; CHECK-OUTLINE-O0-NEXT: mvn w1, w8 ; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas4_rel ; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #8] ; 4-byte Folded Reload -; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: subs w8, w0, w8 ; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB6_1 ; CHECK-OUTLINE-O0-NEXT: b LBB6_2 @@ -582,9 +582,9 @@ define i32 @fetch_and_nand(ptr %p) #0 { ; CHECK-LSE-O0-NEXT: mvn w10, w9 ; CHECK-LSE-O0-NEXT: mov x9, x8 ; CHECK-LSE-O0-NEXT: casl w9, w10, [x11] -; CHECK-LSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-LSE-O0-NEXT: subs w8, w9, w8 ; CHECK-LSE-O0-NEXT: cset w8, eq +; CHECK-LSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-LSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill ; CHECK-LSE-O0-NEXT: tbz w8, #0, LBB6_1 ; CHECK-LSE-O0-NEXT: b LBB6_2 @@ -649,9 +649,9 @@ define i64 @fetch_and_nand_64(ptr %p) #0 { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB7_2 ; CHECK-NOLSE-O0-NEXT: LBB7_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB7_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str x9, [sp, #8] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp, #8] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB7_1 ; CHECK-NOLSE-O0-NEXT: b LBB7_5 @@ -677,9 +677,9 @@ define i64 @fetch_and_nand_64(ptr %p) #0 { ; CHECK-OUTLINE-O0-NEXT: mvn x1, x8 ; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas8_acq_rel ; CHECK-OUTLINE-O0-NEXT: ldr x8, [sp] ; 8-byte Folded Reload -; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: subs x8, x0, x8 ; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #8] ; 8-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #24] ; 8-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB7_1 ; CHECK-OUTLINE-O0-NEXT: b LBB7_2 @@ -719,9 +719,9 @@ define i64 @fetch_and_nand_64(ptr %p) #0 { ; CHECK-LSE-O0-NEXT: mvn x10, x9 ; CHECK-LSE-O0-NEXT: mov x9, x8 ; CHECK-LSE-O0-NEXT: casal x9, x10, [x11] -; CHECK-LSE-O0-NEXT: str x9, [sp, #8] ; 8-byte Folded Spill ; CHECK-LSE-O0-NEXT: subs x8, x9, x8 ; CHECK-LSE-O0-NEXT: cset w8, eq +; CHECK-LSE-O0-NEXT: str x9, [sp, #8] ; 8-byte Folded Spill ; CHECK-LSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill ; CHECK-LSE-O0-NEXT: tbz w8, #0, LBB7_1 ; CHECK-LSE-O0-NEXT: b LBB7_2 @@ -782,9 +782,9 @@ define i32 @fetch_and_or(ptr %p) #0 { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB8_2 ; CHECK-NOLSE-O0-NEXT: LBB8_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB8_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, 
LBB8_1 ; CHECK-NOLSE-O0-NEXT: b LBB8_5 @@ -855,9 +855,9 @@ define i64 @fetch_and_or_64(ptr %p) #0 { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB9_2 ; CHECK-NOLSE-O0-NEXT: LBB9_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB9_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str x9, [sp, #8] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp, #8] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB9_1 ; CHECK-NOLSE-O0-NEXT: b LBB9_5 @@ -4005,9 +4005,9 @@ define i32 @atomicrmw_add_i32(ptr %ptr, i32 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB47_2 ; CHECK-NOLSE-O0-NEXT: LBB47_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB47_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB47_1 ; CHECK-NOLSE-O0-NEXT: b LBB47_5 @@ -4097,9 +4097,9 @@ define i32 @atomicrmw_xchg_i32(ptr %ptr, i32 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB48_2 ; CHECK-NOLSE-O0-NEXT: LBB48_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB48_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB48_1 ; CHECK-NOLSE-O0-NEXT: b LBB48_5 @@ -4190,9 +4190,9 @@ define i32 @atomicrmw_sub_i32(ptr %ptr, i32 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB49_2 ; CHECK-NOLSE-O0-NEXT: LBB49_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB49_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB49_1 ; CHECK-NOLSE-O0-NEXT: b LBB49_5 @@ -4287,9 +4287,9 @@ define i32 @atomicrmw_and_i32(ptr %ptr, i32 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB50_2 ; CHECK-NOLSE-O0-NEXT: LBB50_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB50_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB50_1 ; CHECK-NOLSE-O0-NEXT: b LBB50_5 @@ -4384,9 +4384,9 @@ define i32 @atomicrmw_or_i32(ptr %ptr, i32 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB51_2 ; CHECK-NOLSE-O0-NEXT: LBB51_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB51_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB51_1 ; CHECK-NOLSE-O0-NEXT: b LBB51_5 @@ -4477,9 +4477,9 @@ define i32 @atomicrmw_xor_i32(ptr %ptr, i32 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB52_2 ; CHECK-NOLSE-O0-NEXT: LBB52_4: ; 
%atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB52_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB52_1 ; CHECK-NOLSE-O0-NEXT: b LBB52_5 @@ -4572,9 +4572,9 @@ define i32 @atomicrmw_min_i32(ptr %ptr, i32 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB53_2 ; CHECK-NOLSE-O0-NEXT: LBB53_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB53_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB53_1 ; CHECK-NOLSE-O0-NEXT: b LBB53_5 @@ -4605,9 +4605,9 @@ define i32 @atomicrmw_min_i32(ptr %ptr, i32 %rhs) { ; CHECK-OUTLINE-O0-NEXT: csel w1, w0, w8, le ; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas4_acq ; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #8] ; 4-byte Folded Reload -; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: subs w8, w0, w8 ; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB53_1 ; CHECK-OUTLINE-O0-NEXT: b LBB53_2 @@ -4686,9 +4686,9 @@ define i32 @atomicrmw_max_i32(ptr %ptr, i32 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB54_2 ; CHECK-NOLSE-O0-NEXT: LBB54_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB54_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB54_1 ; CHECK-NOLSE-O0-NEXT: b LBB54_5 @@ -4719,9 +4719,9 @@ define i32 @atomicrmw_max_i32(ptr %ptr, i32 %rhs) { ; CHECK-OUTLINE-O0-NEXT: csel w1, w0, w8, gt ; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas4_rel ; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #8] ; 4-byte Folded Reload -; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: subs w8, w0, w8 ; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB54_1 ; CHECK-OUTLINE-O0-NEXT: b LBB54_2 @@ -4800,9 +4800,9 @@ define i32 @atomicrmw_umin_i32(ptr %ptr, i32 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB55_2 ; CHECK-NOLSE-O0-NEXT: LBB55_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB55_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB55_1 ; CHECK-NOLSE-O0-NEXT: b LBB55_5 @@ -4833,9 +4833,9 @@ define i32 @atomicrmw_umin_i32(ptr %ptr, i32 %rhs) { ; CHECK-OUTLINE-O0-NEXT: csel w1, w0, w8, ls ; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas4_acq_rel ; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #8] ; 4-byte Folded Reload -; CHECK-OUTLINE-O0-NEXT: str w0, [sp, 
#12] ; 4-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: subs w8, w0, w8 ; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB55_1 ; CHECK-OUTLINE-O0-NEXT: b LBB55_2 @@ -4914,9 +4914,9 @@ define i32 @atomicrmw_umax_i32(ptr %ptr, i32 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB56_2 ; CHECK-NOLSE-O0-NEXT: LBB56_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB56_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs w8, w9, w8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str w9, [sp, #12] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str w9, [sp, #28] ; 4-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB56_1 ; CHECK-NOLSE-O0-NEXT: b LBB56_5 @@ -4947,9 +4947,9 @@ define i32 @atomicrmw_umax_i32(ptr %ptr, i32 %rhs) { ; CHECK-OUTLINE-O0-NEXT: csel w1, w0, w8, hi ; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas4_relax ; CHECK-OUTLINE-O0-NEXT: ldr w8, [sp, #8] ; 4-byte Folded Reload -; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: subs w8, w0, w8 ; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #12] ; 4-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: str w0, [sp, #28] ; 4-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB56_1 ; CHECK-OUTLINE-O0-NEXT: b LBB56_2 @@ -5026,9 +5026,9 @@ define i64 @atomicrmw_add_i64(ptr %ptr, i64 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB57_2 ; CHECK-NOLSE-O0-NEXT: LBB57_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB57_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB57_1 ; CHECK-NOLSE-O0-NEXT: b LBB57_5 @@ -5117,9 +5117,9 @@ define i64 @atomicrmw_xchg_i64(ptr %ptr, i64 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB58_2 ; CHECK-NOLSE-O0-NEXT: LBB58_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB58_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB58_1 ; CHECK-NOLSE-O0-NEXT: b LBB58_5 @@ -5210,9 +5210,9 @@ define i64 @atomicrmw_sub_i64(ptr %ptr, i64 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB59_2 ; CHECK-NOLSE-O0-NEXT: LBB59_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB59_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB59_1 ; CHECK-NOLSE-O0-NEXT: b LBB59_5 @@ -5307,9 +5307,9 @@ define i64 @atomicrmw_and_i64(ptr %ptr, i64 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB60_2 ; CHECK-NOLSE-O0-NEXT: LBB60_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB60_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; 
CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB60_1 ; CHECK-NOLSE-O0-NEXT: b LBB60_5 @@ -5404,9 +5404,9 @@ define i64 @atomicrmw_or_i64(ptr %ptr, i64 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB61_2 ; CHECK-NOLSE-O0-NEXT: LBB61_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB61_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB61_1 ; CHECK-NOLSE-O0-NEXT: b LBB61_5 @@ -5497,9 +5497,9 @@ define i64 @atomicrmw_xor_i64(ptr %ptr, i64 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB62_2 ; CHECK-NOLSE-O0-NEXT: LBB62_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB62_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB62_1 ; CHECK-NOLSE-O0-NEXT: b LBB62_5 @@ -5592,9 +5592,9 @@ define i64 @atomicrmw_min_i64(ptr %ptr, i64 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB63_2 ; CHECK-NOLSE-O0-NEXT: LBB63_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB63_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB63_1 ; CHECK-NOLSE-O0-NEXT: b LBB63_5 @@ -5625,9 +5625,9 @@ define i64 @atomicrmw_min_i64(ptr %ptr, i64 %rhs) { ; CHECK-OUTLINE-O0-NEXT: csel x1, x0, x8, le ; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas8_acq ; CHECK-OUTLINE-O0-NEXT: ldr x8, [sp, #8] ; 8-byte Folded Reload -; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: subs x8, x0, x8 ; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #40] ; 8-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB63_1 ; CHECK-OUTLINE-O0-NEXT: b LBB63_2 @@ -5706,9 +5706,9 @@ define i64 @atomicrmw_max_i64(ptr %ptr, i64 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB64_2 ; CHECK-NOLSE-O0-NEXT: LBB64_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB64_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB64_1 ; CHECK-NOLSE-O0-NEXT: b LBB64_5 @@ -5739,9 +5739,9 @@ define i64 @atomicrmw_max_i64(ptr %ptr, i64 %rhs) { ; CHECK-OUTLINE-O0-NEXT: csel x1, x0, x8, gt ; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas8_rel ; CHECK-OUTLINE-O0-NEXT: ldr x8, [sp, #8] ; 8-byte Folded Reload -; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: subs x8, x0, x8 ; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #40] ; 8-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB64_1 ; CHECK-OUTLINE-O0-NEXT: b LBB64_2 @@ -5820,9 +5820,9 @@ define i64 
@atomicrmw_umin_i64(ptr %ptr, i64 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB65_2 ; CHECK-NOLSE-O0-NEXT: LBB65_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB65_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB65_1 ; CHECK-NOLSE-O0-NEXT: b LBB65_5 @@ -5853,9 +5853,9 @@ define i64 @atomicrmw_umin_i64(ptr %ptr, i64 %rhs) { ; CHECK-OUTLINE-O0-NEXT: csel x1, x0, x8, ls ; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas8_acq_rel ; CHECK-OUTLINE-O0-NEXT: ldr x8, [sp, #8] ; 8-byte Folded Reload -; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: subs x8, x0, x8 ; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #40] ; 8-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB65_1 ; CHECK-OUTLINE-O0-NEXT: b LBB65_2 @@ -5934,9 +5934,9 @@ define i64 @atomicrmw_umax_i64(ptr %ptr, i64 %rhs) { ; CHECK-NOLSE-O0-NEXT: cbnz w10, LBB66_2 ; CHECK-NOLSE-O0-NEXT: LBB66_4: ; %atomicrmw.start ; CHECK-NOLSE-O0-NEXT: ; in Loop: Header=BB66_1 Depth=1 -; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: subs x8, x9, x8 ; CHECK-NOLSE-O0-NEXT: cset w8, eq +; CHECK-NOLSE-O0-NEXT: str x9, [sp] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: str x9, [sp, #24] ; 8-byte Folded Spill ; CHECK-NOLSE-O0-NEXT: tbz w8, #0, LBB66_1 ; CHECK-NOLSE-O0-NEXT: b LBB66_5 @@ -5967,9 +5967,9 @@ define i64 @atomicrmw_umax_i64(ptr %ptr, i64 %rhs) { ; CHECK-OUTLINE-O0-NEXT: csel x1, x0, x8, hi ; CHECK-OUTLINE-O0-NEXT: bl ___aarch64_cas8_relax ; CHECK-OUTLINE-O0-NEXT: ldr x8, [sp, #8] ; 8-byte Folded Reload -; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: subs x8, x0, x8 ; CHECK-OUTLINE-O0-NEXT: cset w8, eq +; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #16] ; 8-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: str x0, [sp, #40] ; 8-byte Folded Spill ; CHECK-OUTLINE-O0-NEXT: tbz w8, #0, LBB66_1 ; CHECK-OUTLINE-O0-NEXT: b LBB66_2 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll index 8aea944b55c2d..ceef0c49a45ec 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll @@ -65,22 +65,17 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) { ; GISEL-NEXT: ushl v1.8h, v0.8h, v1.8h ; GISEL-NEXT: umull2 v3.4s, v1.8h, v2.8h ; GISEL-NEXT: umull v1.4s, v1.4h, v2.4h -; GISEL-NEXT: uzp2 v1.8h, v1.8h, v3.8h -; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI1_1] +; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI1_1] ; GISEL-NEXT: adrp x8, .LCPI1_0 -; GISEL-NEXT: sub v2.8h, v0.8h, v1.8h -; GISEL-NEXT: umull2 v4.4s, v2.8h, v3.8h -; GISEL-NEXT: umull v2.4s, v2.4h, v3.4h -; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI1_0] -; GISEL-NEXT: adrp x8, .LCPI1_4 -; GISEL-NEXT: uzp2 v2.8h, v2.8h, v4.8h -; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI1_4] -; GISEL-NEXT: add v1.8h, v2.8h, v1.8h -; GISEL-NEXT: neg v2.8h, v3.8h -; GISEL-NEXT: movi v3.8h, #1 -; GISEL-NEXT: ushl v1.8h, v1.8h, v2.8h -; GISEL-NEXT: cmeq v2.8h, v4.8h, v3.8h -; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b +; GISEL-NEXT: uzp2 v1.8h, v1.8h, v3.8h +; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h +; GISEL-NEXT: umull2 v3.4s, v0.8h, v2.8h +; GISEL-NEXT: umull v0.4s, 
v0.4h, v2.4h +; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI1_0] +; GISEL-NEXT: uzp2 v0.8h, v0.8h, v3.8h +; GISEL-NEXT: add v0.8h, v0.8h, v1.8h +; GISEL-NEXT: neg v1.8h, v2.8h +; GISEL-NEXT: ushl v0.8h, v0.8h, v1.8h ; GISEL-NEXT: ret %1 = udiv <8 x i16> %x, ret <8 x i16> %1 @@ -107,21 +102,16 @@ define <8 x i16> @combine_vec_udiv_nonuniform2(<8 x i16> %x) { ; GISEL-NEXT: adrp x8, .LCPI2_2 ; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI2_2] ; GISEL-NEXT: adrp x8, .LCPI2_1 -; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI2_1] +; GISEL-NEXT: neg v1.8h, v1.8h +; GISEL-NEXT: ushl v0.8h, v0.8h, v1.8h +; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI2_1] ; GISEL-NEXT: adrp x8, .LCPI2_0 +; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h +; GISEL-NEXT: umull v0.4s, v0.4h, v1.4h +; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] ; GISEL-NEXT: neg v1.8h, v1.8h -; GISEL-NEXT: ushl v1.8h, v0.8h, v1.8h -; GISEL-NEXT: umull2 v3.4s, v1.8h, v2.8h -; GISEL-NEXT: umull v1.4s, v1.4h, v2.4h -; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI2_0] -; GISEL-NEXT: adrp x8, .LCPI2_3 -; GISEL-NEXT: neg v2.8h, v2.8h -; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI2_3] -; GISEL-NEXT: uzp2 v1.8h, v1.8h, v3.8h -; GISEL-NEXT: movi v3.8h, #1 -; GISEL-NEXT: ushl v1.8h, v1.8h, v2.8h -; GISEL-NEXT: cmeq v2.8h, v4.8h, v3.8h -; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b +; GISEL-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; GISEL-NEXT: ushl v0.8h, v0.8h, v1.8h ; GISEL-NEXT: ret %1 = udiv <8 x i16> %x, ret <8 x i16> %1 @@ -145,21 +135,16 @@ define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) { ; GISEL-LABEL: combine_vec_udiv_nonuniform3: ; GISEL: // %bb.0: ; GISEL-NEXT: adrp x8, .LCPI3_1 -; GISEL-NEXT: movi v3.8h, #1 ; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI3_1] ; GISEL-NEXT: adrp x8, .LCPI3_0 ; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h ; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h ; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h -; GISEL-NEXT: sub v2.8h, v0.8h, v1.8h -; GISEL-NEXT: usra v1.8h, v2.8h, #1 -; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI3_0] -; GISEL-NEXT: adrp x8, .LCPI3_2 -; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI3_2] -; GISEL-NEXT: neg v2.8h, v2.8h -; GISEL-NEXT: ushl v1.8h, v1.8h, v2.8h -; GISEL-NEXT: cmeq v2.8h, v4.8h, v3.8h -; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b +; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h +; GISEL-NEXT: usra v1.8h, v0.8h, #1 +; GISEL-NEXT: ldr q0, [x8, :lo12:.LCPI3_0] +; GISEL-NEXT: neg v0.8h, v0.8h +; GISEL-NEXT: ushl v0.8h, v1.8h, v0.8h ; GISEL-NEXT: ret %1 = udiv <8 x i16> %x, ret <8 x i16> %1 @@ -184,19 +169,19 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { ; ; GISEL-LABEL: combine_vec_udiv_nonuniform4: ; GISEL: // %bb.0: +; GISEL-NEXT: adrp x8, .LCPI4_2 +; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI4_2] ; GISEL-NEXT: adrp x8, .LCPI4_1 -; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI4_1] +; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI4_1] ; GISEL-NEXT: adrp x8, .LCPI4_0 -; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI4_0] -; GISEL-NEXT: adrp x8, .LCPI4_2 ; GISEL-NEXT: umull2 v2.8h, v0.16b, v1.16b ; GISEL-NEXT: umull v1.8h, v0.8b, v1.8b -; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI4_2] +; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI4_0] ; GISEL-NEXT: uzp2 v1.16b, v1.16b, v2.16b ; GISEL-NEXT: neg v2.16b, v3.16b -; GISEL-NEXT: movi v3.16b, #1 +; GISEL-NEXT: shl v3.16b, v4.16b, #7 ; GISEL-NEXT: ushl v1.16b, v1.16b, v2.16b -; GISEL-NEXT: cmeq v2.16b, v4.16b, v3.16b +; GISEL-NEXT: sshr v2.16b, v3.16b, #7 ; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b ; GISEL-NEXT: ret %div = udiv <16 x i8> %x, @@ -232,10 +217,10 @@ define <8 x i16> @pr38477(<8 x i16> %a0) { ; ; GISEL-LABEL: pr38477: ; GISEL: // %bb.0: +; GISEL-NEXT: adrp x8, .LCPI5_3 +; 
GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI5_3] ; GISEL-NEXT: adrp x8, .LCPI5_2 -; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI5_2] -; GISEL-NEXT: adrp x8, .LCPI5_1 -; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI5_1] +; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI5_2] ; GISEL-NEXT: adrp x8, .LCPI5_0 ; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h ; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h @@ -243,15 +228,16 @@ define <8 x i16> @pr38477(<8 x i16> %a0) { ; GISEL-NEXT: sub v2.8h, v0.8h, v1.8h ; GISEL-NEXT: umull2 v4.4s, v2.8h, v3.8h ; GISEL-NEXT: umull v2.4s, v2.4h, v3.4h -; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI5_0] -; GISEL-NEXT: adrp x8, .LCPI5_3 +; GISEL-NEXT: ldr d3, [x8, :lo12:.LCPI5_0] +; GISEL-NEXT: adrp x8, .LCPI5_1 +; GISEL-NEXT: ushll v3.8h, v3.8b, #0 ; GISEL-NEXT: uzp2 v2.8h, v2.8h, v4.8h -; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI5_3] +; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI5_1] +; GISEL-NEXT: shl v3.8h, v3.8h, #15 ; GISEL-NEXT: add v1.8h, v2.8h, v1.8h -; GISEL-NEXT: neg v2.8h, v3.8h -; GISEL-NEXT: movi v3.8h, #1 +; GISEL-NEXT: neg v2.8h, v4.8h ; GISEL-NEXT: ushl v1.8h, v1.8h, v2.8h -; GISEL-NEXT: cmeq v2.8h, v4.8h, v3.8h +; GISEL-NEXT: sshr v2.8h, v3.8h, #15 ; GISEL-NEXT: bif v0.16b, v1.16b, v2.16b ; GISEL-NEXT: ret %1 = udiv <8 x i16> %a0, diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir index ee33b9c50cbea..02233b9f498bd 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir @@ -6,7 +6,9 @@ body: | bb.1: liveins: $w0 ; CHECK-LABEL: name: udiv_by_scalar_const - ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 818089009 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 @@ -68,44 +70,32 @@ body: | ; CHECK: liveins: $q0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 23 - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 34 - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -23 - ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 56 - ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 128 - ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 - ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 -256 - ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C2]](s16), [[C3]](s16), [[C4]](s16), [[C5]](s16), [[C6]](s16), [[C7]](s16) - ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 - ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645 - ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 4 - ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 -3855 - ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 5 - ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 8195 - ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 13 - ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 3 - ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s16) = G_CONSTANT i16 9363 - ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s16) = G_CONSTANT i16 512 - ; CHECK-NEXT: [[C18:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32767 - ; CHECK-NEXT: [[C19:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 - ; CHECK-NEXT: [[C20:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32639 - ; CHECK-NEXT: [[C21:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 - ; CHECK-NEXT: 
[[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C15]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16) - ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C9]](s16), [[C11]](s16), [[C13]](s16), [[C16]](s16), [[C17]](s16), [[C18]](s16), [[C20]](s16), [[C21]](s16) - ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C7]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16) - ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C10]](s16), [[C12]](s16), [[C14]](s16), [[C8]](s16), [[C8]](s16), [[C19]](s16), [[C19]](s16), [[C8]](s16) - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[COPY]], [[BUILD_VECTOR1]](<8 x s16>) - ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[LSHR]], [[BUILD_VECTOR2]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 4 + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -3855 + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 5 + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 8195 + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 13 + ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 3 + ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 9363 + ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 512 + ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32767 + ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32639 + ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C8]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C4]](s16), [[C6]](s16), [[C9]](s16), [[C10]](s16), [[C11]](s16), [[C13]](s16), [[C14]](s16) + ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16) + ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C5]](s16), [[C7]](s16), [[C1]](s16), [[C1]](s16), [[C12]](s16), [[C12]](s16), [[C1]](s16) + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[COPY]], [[BUILD_VECTOR]](<8 x s16>) + ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[LSHR]], [[BUILD_VECTOR1]] ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]] - ; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR3]] + ; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR2]] ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UMULH1]], [[UMULH]] - ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR4]](<8 x s16>) - ; CHECK-NEXT: [[C22:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 - ; CHECK-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16), [[C22]](s16) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(eq), [[BUILD_VECTOR]](<8 x s16>), [[BUILD_VECTOR5]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<8 x s16>) = G_SELECT [[ICMP]](<8 x s1>), [[COPY]], [[LSHR1]] - ; CHECK-NEXT: $q0 = COPY [[SELECT]](<8 x 
s16>) + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR3]](<8 x s16>) + ; CHECK-NEXT: $q0 = COPY [[LSHR1]](<8 x s16>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<8 x s16>) = COPY $q0 %2:_(s16) = G_CONSTANT i16 23 @@ -136,38 +126,26 @@ body: | ; CHECK: liveins: $q0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 -34 - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 35 - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 36 - ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -37 - ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 38 - ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 -39 - ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 40 - ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 -41 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C2]](s16), [[C3]](s16), [[C4]](s16), [[C5]](s16), [[C6]](s16), [[C7]](s16) - ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 - ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 16393 - ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 - ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 13 - ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 -5617 - ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 5 - ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 -7281 - ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32749 - ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 - ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s16) = G_CONSTANT i16 -10347 - ; CHECK-NEXT: [[C18:%[0-9]+]]:_(s16) = G_CONSTANT i16 8197 - ; CHECK-NEXT: [[C19:%[0-9]+]]:_(s16) = G_CONSTANT i16 -13107 - ; CHECK-NEXT: [[C20:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32747 - ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C10]](s16), [[C10]](s16), [[C10]](s16), [[C10]](s16), [[C10]](s16), [[C10]](s16), [[C10]](s16) - ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C9]](s16), [[C12]](s16), [[C14]](s16), [[C15]](s16), [[C17]](s16), [[C18]](s16), [[C19]](s16), [[C20]](s16) - ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C11]](s16), [[C13]](s16), [[C13]](s16), [[C16]](s16), [[C13]](s16), [[C11]](s16), [[C13]](s16), [[C16]](s16) - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[COPY]], [[BUILD_VECTOR1]](<8 x s16>) - ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[LSHR]], [[BUILD_VECTOR2]] - ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[UMULH]], [[BUILD_VECTOR3]](<8 x s16>) - ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(eq), [[BUILD_VECTOR]](<8 x s16>), [[BUILD_VECTOR4]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<8 x s16>) = G_SELECT [[ICMP]](<8 x s1>), [[COPY]], [[LSHR1]] - ; CHECK-NEXT: $q0 = COPY [[SELECT]](<8 x s16>) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 16393 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 13 + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -5617 + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 5 + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 -7281 + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32749 + ; CHECK-NEXT: 
[[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 -10347 + ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 8197 + ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 -13107 + ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32747 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C4]](s16), [[C6]](s16), [[C7]](s16), [[C9]](s16), [[C10]](s16), [[C11]](s16), [[C12]](s16) + ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C5]](s16), [[C5]](s16), [[C8]](s16), [[C5]](s16), [[C3]](s16), [[C5]](s16), [[C8]](s16) + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[COPY]], [[BUILD_VECTOR]](<8 x s16>) + ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[LSHR]], [[BUILD_VECTOR1]] + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[UMULH]], [[BUILD_VECTOR2]](<8 x s16>) + ; CHECK-NEXT: $q0 = COPY [[LSHR1]](<8 x s16>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<8 x s16>) = COPY $q0 %2:_(s16) = G_CONSTANT i16 -34 @@ -198,39 +176,28 @@ body: | ; CHECK: liveins: $q0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 7 - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 23 - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 25 - ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 27 - ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 31 - ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 47 - ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 63 - ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 127 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C2]](s16), [[C3]](s16), [[C4]](s16), [[C5]](s16), [[C6]](s16), [[C7]](s16) - ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 9363 - ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 - ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645 - ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 4 - ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 18351 - ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 12137 - ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 2115 - ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 23705 - ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s16) = G_CONSTANT i16 5 - ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s16) = G_CONSTANT i16 1041 - ; CHECK-NEXT: [[C18:%[0-9]+]]:_(s16) = G_CONSTANT i16 517 - ; CHECK-NEXT: [[C19:%[0-9]+]]:_(s16) = G_CONSTANT i16 6 - ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C10]](s16), [[C12]](s16), [[C13]](s16), [[C14]](s16), [[C15]](s16), [[C17]](s16), [[C18]](s16) - ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C9]](s16), [[C11]](s16), [[C11]](s16), [[C11]](s16), [[C11]](s16), [[C16]](s16), [[C16]](s16), [[C19]](s16) - ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR1]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 9363 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 25645 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 4 + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 18351 + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 12137 + ; CHECK-NEXT: 
[[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 2115 + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 23705 + ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 5 + ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 1041 + ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 517 + ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 6 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C2]](s16), [[C4]](s16), [[C5]](s16), [[C6]](s16), [[C7]](s16), [[C9]](s16), [[C10]](s16) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C8]](s16), [[C8]](s16), [[C11]](s16) + ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR]] ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]] - ; CHECK-NEXT: [[C20:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 - ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16), [[C20]](s16) - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[SUB]], [[BUILD_VECTOR3]](<8 x s16>) + ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16), [[C12]](s16) + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[SUB]], [[BUILD_VECTOR2]](<8 x s16>) ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[LSHR]], [[UMULH]] - ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR2]](<8 x s16>) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(eq), [[BUILD_VECTOR]](<8 x s16>), [[BUILD_VECTOR3]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<8 x s16>) = G_SELECT [[ICMP]](<8 x s1>), [[COPY]], [[LSHR1]] - ; CHECK-NEXT: $q0 = COPY [[SELECT]](<8 x s16>) + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR1]](<8 x s16>) + ; CHECK-NEXT: $q0 = COPY [[LSHR1]](<8 x s16>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<8 x s16>) = COPY $q0 %2:_(s16) = G_CONSTANT i16 7 @@ -261,19 +228,17 @@ body: | ; CHECK: liveins: $q0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 -64 - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 1 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8) - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s8) = G_CONSTANT i8 0 - ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s8) = G_CONSTANT i8 -85 - ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s8) = G_CONSTANT i8 7 - ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C3]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8) - ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C4]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8), [[C2]](s8) - ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<16 x s8>) = G_UMULH [[COPY]], [[BUILD_VECTOR1]] - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<16 x s8>) = G_LSHR [[UMULH]], [[BUILD_VECTOR2]](<16 x s8>) - ; 
CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8), [[C1]](s8) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<16 x s1>) = G_ICMP intpred(eq), [[BUILD_VECTOR]](<16 x s8>), [[BUILD_VECTOR3]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<16 x s8>) = G_SELECT [[ICMP]](<16 x s1>), [[COPY]], [[LSHR]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 -85 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s8) = G_CONSTANT i8 7 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C1]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<16 x s8>) = G_BUILD_VECTOR [[C2]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8), [[C]](s8) + ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<16 x s8>) = G_UMULH [[COPY]], [[BUILD_VECTOR]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<16 x s8>) = G_LSHR [[UMULH]], [[BUILD_VECTOR1]](<16 x s8>) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 false + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<16 x s1>) = G_BUILD_VECTOR [[C3]](s1), [[C4]](s1), [[C4]](s1), [[C4]](s1), [[C4]](s1), [[C4]](s1), [[C4]](s1), [[C4]](s1), [[C4]](s1), [[C4]](s1), [[C4]](s1), [[C4]](s1), [[C4]](s1), [[C4]](s1), [[C4]](s1), [[C4]](s1) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<16 x s8>) = G_SELECT [[BUILD_VECTOR2]](<16 x s1>), [[COPY]], [[LSHR]] ; CHECK-NEXT: $q0 = COPY [[SELECT]](<16 x s8>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<16 x s8>) = COPY $q0 @@ -299,39 +264,31 @@ body: | ; CHECK: liveins: $q0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 119 - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 73 - ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -111 - ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -3 - ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 118 - ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 32 - ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 31 - ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C2]](s16), [[C3]](s16), [[C4]](s16), [[C5]](s16), [[C6]](s16), [[C7]](s16) - ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 - ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 4957 - ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 - ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 6 - ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 -8079 - ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s16) = G_CONSTANT i16 4103 - ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s16) = G_CONSTANT i16 12 - ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 16385 - ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s16) = G_CONSTANT i16 14 - ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s16) = G_CONSTANT i16 -29991 - ; CHECK-NEXT: [[C18:%[0-9]+]]:_(s16) = G_CONSTANT i16 2048 - ; CHECK-NEXT: [[C19:%[0-9]+]]:_(s16) = G_CONSTANT i16 2115 - ; CHECK-NEXT: [[C20:%[0-9]+]]:_(s16) = G_CONSTANT i16 4 - ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), 
[[C9]](s16), [[C12]](s16), [[C13]](s16), [[C15]](s16), [[C17]](s16), [[C18]](s16), [[C19]](s16) - ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C10]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C8]](s16), [[C10]](s16) - ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C8]](s16), [[C11]](s16), [[C11]](s16), [[C14]](s16), [[C16]](s16), [[C11]](s16), [[C8]](s16), [[C20]](s16) - ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR1]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 4957 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -32768 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 6 + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 -8079 + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 4103 + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s16) = G_CONSTANT i16 12 + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 16385 + ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 14 + ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s16) = G_CONSTANT i16 -29991 + ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s16) = G_CONSTANT i16 2048 + ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s16) = G_CONSTANT i16 2115 + ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s16) = G_CONSTANT i16 4 + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C1]](s16), [[C4]](s16), [[C5]](s16), [[C7]](s16), [[C9]](s16), [[C10]](s16), [[C11]](s16) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C2]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C2]](s16) + ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C3]](s16), [[C3]](s16), [[C6]](s16), [[C8]](s16), [[C3]](s16), [[C]](s16), [[C12]](s16) + ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[COPY]], [[BUILD_VECTOR]] ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[COPY]], [[UMULH]] - ; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR2]] + ; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(<8 x s16>) = G_UMULH [[SUB]], [[BUILD_VECTOR1]] ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UMULH1]], [[UMULH]] - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR3]](<8 x s16>) - ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16) - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(<8 x s1>) = G_ICMP intpred(eq), [[BUILD_VECTOR]](<8 x s16>), [[BUILD_VECTOR4]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<8 x s16>) = G_SELECT [[ICMP]](<8 x s1>), [[COPY]], [[LSHR]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[ADD]], [[BUILD_VECTOR2]](<8 x s16>) + ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s1) = G_CONSTANT i1 false + ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s1>) = G_BUILD_VECTOR [[C13]](s1), [[C14]](s1), [[C14]](s1), [[C14]](s1), [[C14]](s1), [[C14]](s1), [[C14]](s1), [[C14]](s1) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(<8 x s16>) = G_SELECT [[BUILD_VECTOR3]](<8 x s1>), [[COPY]], [[LSHR]] ; CHECK-NEXT: $q0 = COPY [[SELECT]](<8 x s16>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<8 x s16>) = COPY $q0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-trunc.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-trunc.ll new file mode 100644 index 0000000000000..d87e9c4b18550 --- /dev/null +++ 
b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-trunc.ll @@ -0,0 +1,90 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -O0 -mtriple=aarch64-linux-gnu -global-isel -stop-after=irtranslator %s -o - | FileCheck %s + +define i32 @call_trunc_no_flags(i64 %a) { + ; CHECK-LABEL: name: call_trunc_no_flags + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %result = trunc i64 %a to i32 + ret i32 %result +} + +define i32 @call_trunc_nsw_flags(i64 %a) { + ; CHECK-LABEL: name: call_trunc_nsw_flags + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = nsw G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %result = trunc nsw i64 %a to i32 + ret i32 %result +} + +define i32 @call_trunc_nuw_flags(i64 %a) { + ; CHECK-LABEL: name: call_trunc_nuw_flags + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = nuw G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %result = trunc nuw i64 %a to i32 + ret i32 %result +} + +define i32 @call_trunc_all_flags(i64 %a) { + ; CHECK-LABEL: name: call_trunc_all_flags + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = nuw nsw G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 +entry: + %result = trunc nsw nuw i64 %a to i32 + ret i32 %result +} + +define <2 x i64> @call_trunc_noop_signed_vector(<2 x i64> %a) { + ; CHECK-LABEL: name: call_trunc_noop_signed_vector + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s32>) = nsw G_TRUNC [[COPY]](<2 x s64>) + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<2 x s64>) = G_SEXT [[TRUNC]](<2 x s32>) + ; CHECK-NEXT: $q0 = COPY [[SEXT]](<2 x s64>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 +entry: + %truncate = trunc nsw <2 x i64> %a to <2 x i32> + %result = sext <2 x i32> %truncate to <2 x i64> + ret <2 x i64> %result +} + +define <2 x i64> @call_trunc_noop_unsigned_vector(<2 x i64> %a) { + ; CHECK-LABEL: name: call_trunc_noop_unsigned_vector + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s32>) = nuw G_TRUNC [[COPY]](<2 x s64>) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<2 x s64>) = G_ZEXT [[TRUNC]](<2 x s32>) + ; CHECK-NEXT: $q0 = COPY [[ZEXT]](<2 x s64>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 +entry: + %truncate = trunc nuw <2 x i64> %a to <2 x i32> + %result = zext <2 x i32> %truncate to <2 x i64> + ret <2 x i64> %result +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-128.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-128.mir index 6a6e0b63b103a..26230efbbe863 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-128.mir +++ 
b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-128.mir @@ -12,22 +12,6 @@ body: | liveins: $x0, $x1, $x2, $x3, $x4 - ; CHECK-LABEL: name: compare_swap_128 - ; CHECK: liveins: $x0_x1, $x1, $x0, $x1, $x2, $x3, $x4 - ; CHECK: [[COPY:%[0-9]+]]:gpr64(p0) = COPY $x0 - ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 - ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2 - ; CHECK: [[COPY3:%[0-9]+]]:_(s64) = COPY $x3 - ; CHECK: [[COPY4:%[0-9]+]]:_(s64) = COPY $x4 - ; CHECK: [[COPY5:%[0-9]+]]:gpr64(s64) = COPY [[COPY1]](s64) - ; CHECK: [[COPY6:%[0-9]+]]:gpr64(s64) = COPY [[COPY2]](s64) - ; CHECK: [[COPY7:%[0-9]+]]:gpr64(s64) = COPY [[COPY3]](s64) - ; CHECK: [[COPY8:%[0-9]+]]:gpr64(s64) = COPY [[COPY4]](s64) - ; CHECK: early-clobber %13:gpr64(s64), early-clobber %14:gpr64(s64), early-clobber %16:gpr32common = CMP_SWAP_128_ACQUIRE [[COPY]](p0), [[COPY5]](s64), [[COPY6]](s64), [[COPY7]](s64), [[COPY8]](s64) :: (load store acquire acquire 16) - ; CHECK: [[COPY9:%[0-9]+]]:gpr64 = COPY %16 - ; CHECK: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES %13(s64), %14(s64) - ; CHECK: G_STORE [[MV]](s128), [[COPY]](p0) :: (store 16) - ; CHECK: RET_ReallyLR ; CHECK-NOLSE-LABEL: name: compare_swap_128 ; CHECK-NOLSE: liveins: $x0_x1, $x1, $x0, $x1, $x2, $x3, $x4 ; CHECK-NOLSE-NEXT: {{ $}} @@ -40,11 +24,13 @@ body: | ; CHECK-NOLSE-NEXT: [[COPY6:%[0-9]+]]:gpr64(s64) = COPY [[COPY2]](s64) ; CHECK-NOLSE-NEXT: [[COPY7:%[0-9]+]]:gpr64(s64) = COPY [[COPY3]](s64) ; CHECK-NOLSE-NEXT: [[COPY8:%[0-9]+]]:gpr64(s64) = COPY [[COPY4]](s64) - ; CHECK-NOLSE-NEXT: early-clobber %13:gpr64common(s64), early-clobber %14:gpr64common(s64), early-clobber %16:gpr32common = CMP_SWAP_128_ACQUIRE [[COPY]](p0), [[COPY5]](s64), [[COPY6]](s64), [[COPY7]](s64), [[COPY8]](s64) :: (load store acquire acquire (s128)) - ; CHECK-NOLSE-NEXT: [[COPY9:%[0-9]+]]:gpr64 = COPY %16 - ; CHECK-NOLSE-NEXT: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES %13(s64), %14(s64) - ; CHECK-NOLSE-NEXT: G_STORE [[MV]](s128), [[COPY]](p0) :: (store (s128)) + ; CHECK-NOLSE-NEXT: early-clobber %14:gpr64common(s64), early-clobber %15:gpr64common(s64), early-clobber %17:gpr32common = CMP_SWAP_128_ACQUIRE [[COPY]](p0), [[COPY5]](s64), [[COPY6]](s64), [[COPY7]](s64), [[COPY8]](s64) :: (load store acquire acquire (s128)) + ; CHECK-NOLSE-NEXT: [[COPY9:%[0-9]+]]:gpr64 = COPY %17 + ; CHECK-NOLSE-NEXT: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES %14(s64), %15(s64) + ; CHECK-NOLSE-NEXT: [[COPY10:%[0-9]+]]:_(s128) = COPY [[MV]](s128) + ; CHECK-NOLSE-NEXT: G_STORE [[COPY10]](s128), [[COPY]](p0) :: (store (s128)) ; CHECK-NOLSE-NEXT: RET_ReallyLR + ; ; CHECK-LSE-LABEL: name: compare_swap_128 ; CHECK-LSE: liveins: $x0_x1, $x1, $x0, $x1, $x2, $x3, $x4 ; CHECK-LSE-NEXT: {{ $}} @@ -59,7 +45,8 @@ body: | ; CHECK-LSE-NEXT: [[EXTRACT:%[0-9]+]]:_(s64) = G_EXTRACT [[CASPAX]](s128), 0 ; CHECK-LSE-NEXT: [[EXTRACT1:%[0-9]+]]:_(s64) = G_EXTRACT [[CASPAX]](s128), 64 ; CHECK-LSE-NEXT: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[EXTRACT]](s64), [[EXTRACT1]](s64) - ; CHECK-LSE-NEXT: G_STORE [[MV]](s128), [[COPY]](p0) :: (store (s128)) + ; CHECK-LSE-NEXT: [[COPY5:%[0-9]+]]:_(s128) = COPY [[MV]](s128) + ; CHECK-LSE-NEXT: G_STORE [[COPY5]](s128), [[COPY]](p0) :: (store (s128)) ; CHECK-LSE-NEXT: RET_ReallyLR %0:_(p0) = COPY $x0 %3:_(s64) = COPY $x1 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-with-success.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-with-success.mir index 3c010789a2b7f..05e6212af0620 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-with-success.mir +++ 
b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cmpxchg-with-success.mir @@ -16,13 +16,16 @@ body: | liveins: $x0 ; CHECK-LABEL: name: cmpxchg_i32 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; CHECK: [[ATOMIC_CMPXCHG:%[0-9]+]]:_(s32) = G_ATOMIC_CMPXCHG [[COPY]](p0), [[C]], [[C1]] :: (load store monotonic (s64) on %ir.addr) - ; CHECK: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[ATOMIC_CMPXCHG]](s32), [[C]] - ; CHECK: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[ATOMIC_CMPXCHG]], [[ICMP]] - ; CHECK: $w0 = COPY [[MUL]](s32) + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[ATOMIC_CMPXCHG:%[0-9]+]]:_(s32) = G_ATOMIC_CMPXCHG [[COPY]](p0), [[C]], [[C1]] :: (load store monotonic (s64) on %ir.addr) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[ATOMIC_CMPXCHG]](s32), [[C]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[ATOMIC_CMPXCHG]](s32) + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY1]], [[ICMP]] + ; CHECK-NEXT: $w0 = COPY [[MUL]](s32) %0:_(p0) = COPY $x0 %1:_(s32) = G_CONSTANT i32 0 %2:_(s32) = G_CONSTANT i32 1 @@ -40,14 +43,17 @@ body: | liveins: $x0 ; CHECK-LABEL: name: cmpxchg_i64 - ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; CHECK: [[ATOMIC_CMPXCHG:%[0-9]+]]:_(s64) = G_ATOMIC_CMPXCHG [[COPY]](p0), [[C]], [[C1]] :: (load store monotonic (s64) on %ir.addr) - ; CHECK: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[ATOMIC_CMPXCHG]](s64), [[C]] - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[ICMP]](s32) - ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ATOMIC_CMPXCHG]], [[ANYEXT]] - ; CHECK: $x0 = COPY [[MUL]](s64) + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[ATOMIC_CMPXCHG:%[0-9]+]]:_(s64) = G_ATOMIC_CMPXCHG [[COPY]](p0), [[C]], [[C1]] :: (load store monotonic (s64) on %ir.addr) + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[ATOMIC_CMPXCHG]](s64), [[C]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY [[ATOMIC_CMPXCHG]](s64) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[ICMP]](s32) + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[COPY1]], [[ANYEXT]] + ; CHECK-NEXT: $x0 = COPY [[MUL]](s64) %0:_(p0) = COPY $x0 %1:_(s64) = G_CONSTANT i64 0 %2:_(s64) = G_CONSTANT i64 1 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop-no-implicit-float.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop-no-implicit-float.mir index d2352be81503d..27f2f0bafa95a 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop-no-implicit-float.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop-no-implicit-float.mir @@ -37,6 +37,7 @@ body: | ; CHECK-NEXT: %ctpop:_(s32) = G_LSHR [[MUL]], [[C7]](s64) ; CHECK-NEXT: $w0 = COPY %ctpop(s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 + ; ; CHECK-CSSC-LABEL: name: s32 ; CHECK-CSSC: liveins: $w0 ; CHECK-CSSC-NEXT: {{ $}} @@ -77,11 +78,12 @@ body: | ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 1085102592571150095 ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[ADD1]], [[C5]] ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 
72340172838076673 - ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND3]], [[C6]] ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 56 + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND3]], [[C6]] ; CHECK-NEXT: %ctpop:_(s64) = G_LSHR [[MUL]], [[C7]](s64) ; CHECK-NEXT: $x0 = COPY %ctpop(s64) ; CHECK-NEXT: RET_ReallyLR implicit $x0 + ; ; CHECK-CSSC-LABEL: name: s64 ; CHECK-CSSC: liveins: $x0 ; CHECK-CSSC-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select.mir index e207a31063bac..b3613f52c4ec6 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/select.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -O0 -mtriple=aarch64-apple-ios -run-pass=instruction-select -global-isel-abort=1 -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=IOS # RUN: llc -O0 -mtriple=aarch64-linux-gnu -relocation-model=pic -run-pass=instruction-select -global-isel-abort=1 -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=LINUX-PIC @@ -26,40 +27,35 @@ ... --- -# CHECK-LABEL: name: frame_index name: frame_index legalized: true regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr64sp, preferred-register: '' } -registers: - - { id: 0, class: gpr } - stack: - { id: 0, name: ptr0, offset: 0, size: 8, alignment: 8 } - -# CHECK: body: -# CHECK: %0:gpr64sp = ADDXri %stack.0.ptr0, 0, 0 body: | bb.0: - %0(p0) = G_FRAME_INDEX %stack.0.ptr0 + ; CHECK-LABEL: name: frame_index + ; CHECK: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri %stack.0.ptr0, 0, 0 + ; CHECK-NEXT: $x0 = COPY [[ADDXri]] + %0:gpr(p0) = G_FRAME_INDEX %stack.0.ptr0 $x0 = COPY %0(p0) ... --- --- -# CHECK-LABEL: name: ptr_mask name: ptr_mask legalized: true regBankSelected: true - -# CHECK: body: -# CHECK: %2:gpr64sp = ANDXri %0, 8060 body: | bb.0: liveins: $x0 + ; CHECK-LABEL: name: ptr_mask + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[ANDXri:%[0-9]+]]:gpr64sp = ANDXri [[COPY]], 8060 + ; CHECK-NEXT: $x0 = COPY [[ANDXri]] %0:gpr(p0) = COPY $x0 %const:gpr(s64) = G_CONSTANT i64 -8 %1:gpr(p0) = G_PTRMASK %0, %const @@ -68,200 +64,171 @@ body: | --- # Global defined in the same linkage unit so no GOT is needed -# CHECK-LABEL: name: global_local name: global_local legalized: true regBankSelected: true -registers: - - { id: 0, class: gpr } - -# CHECK: body: -# IOS: %0:gpr64common = MOVaddr target-flags(aarch64-page) @var_local, target-flags(aarch64-pageoff, aarch64-nc) @var_local -# LINUX-PIC: %0:gpr64common = LOADgot target-flags(aarch64-got) @var_local body: | bb.0: - %0(p0) = G_GLOBAL_VALUE @var_local + ; IOS-LABEL: name: global_local + ; IOS: [[MOVaddr:%[0-9]+]]:gpr64common = MOVaddr target-flags(aarch64-page) @var_local, target-flags(aarch64-pageoff, aarch64-nc) @var_local + ; IOS-NEXT: $x0 = COPY [[MOVaddr]] + ; + ; LINUX-PIC-LABEL: name: global_local + ; LINUX-PIC: [[LOADgot:%[0-9]+]]:gpr64common = LOADgot target-flags(aarch64-got) @var_local + ; LINUX-PIC-NEXT: $x0 = COPY [[LOADgot]] + %0:gpr(p0) = G_GLOBAL_VALUE @var_local $x0 = COPY %0(p0) ... 
--- -# CHECK-LABEL: name: global_got name: global_got legalized: true regBankSelected: true -registers: - - { id: 0, class: gpr } - -# CHECK: body: -# IOS: %0:gpr64common = LOADgot target-flags(aarch64-got) @var_got -# LINUX-PIC: %0:gpr64common = LOADgot target-flags(aarch64-got) @var_got body: | bb.0: - %0(p0) = G_GLOBAL_VALUE @var_got + ; CHECK-LABEL: name: global_got + ; CHECK: [[LOADgot:%[0-9]+]]:gpr64common = LOADgot target-flags(aarch64-got) @var_got + ; CHECK-NEXT: $x0 = COPY [[LOADgot]] + %0:gpr(p0) = G_GLOBAL_VALUE @var_got $x0 = COPY %0(p0) ... --- -# CHECK-LABEL: name: icmp name: icmp legalized: true regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr32, preferred-register: '' } -# CHECK-NEXT: - { id: 1, class: gpr32, preferred-register: '' } -# CHECK-NEXT: - { id: 2, class: gpr64, preferred-register: '' } -# CHECK-NEXT: - { id: 3, class: gpr32, preferred-register: '' } -# CHECK-NEXT: - { id: 4, class: gpr64, preferred-register: '' } -# CHECK-NEXT: - { id: 5, class: gpr32, preferred-register: '' } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - - { id: 3, class: gpr } - - { id: 4, class: gpr } - - { id: 5, class: gpr } - - { id: 6, class: gpr } - - { id: 7, class: gpr } - - { id: 8, class: gpr } - - { id: 9, class: gpr } - - { id: 10, class: gpr } - - { id: 11, class: gpr } - -# CHECK: body: -# CHECK: SUBSWrr %0, %0, implicit-def $nzcv -# CHECK: %1:gpr32 = CSINCWr $wzr, $wzr, 1, implicit $nzcv - -# CHECK: SUBSXrr %2, %2, implicit-def $nzcv -# CHECK: %3:gpr32 = CSINCWr $wzr, $wzr, 3, implicit $nzcv - -# CHECK: SUBSXrr %4, %4, implicit-def $nzcv -# CHECK: %5:gpr32 = CSINCWr $wzr, $wzr, 0, implicit $nzcv - body: | bb.0: liveins: $w0, $x0 - %0(s32) = COPY $w0 - %1(s32) = G_ICMP intpred(eq), %0, %0 - %6(s8) = G_TRUNC %1(s32) - %9(s32) = G_ANYEXT %6 + ; CHECK-LABEL: name: icmp + ; CHECK: liveins: $w0, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[COPY]], [[COPY]], implicit-def $nzcv + ; CHECK-NEXT: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 1, implicit $nzcv + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32all = COPY [[CSINCWr]] + ; CHECK-NEXT: $w0 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[SUBSXrr:%[0-9]+]]:gpr64 = SUBSXrr [[COPY2]], [[COPY2]], implicit-def $nzcv + ; CHECK-NEXT: [[CSINCWr1:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 3, implicit $nzcv + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32all = COPY [[CSINCWr1]] + ; CHECK-NEXT: $w0 = COPY [[COPY3]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[SUBSXrr1:%[0-9]+]]:gpr64 = SUBSXrr [[COPY4]], [[COPY4]], implicit-def $nzcv + ; CHECK-NEXT: [[CSINCWr2:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 0, implicit $nzcv + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr32all = COPY [[CSINCWr2]] + ; CHECK-NEXT: $w0 = COPY [[COPY5]] + %0:gpr(s32) = COPY $w0 + %1:gpr(s32) = G_ICMP intpred(eq), %0, %0 + %6:gpr(s8) = G_TRUNC %1(s32) + %9:gpr(s32) = G_ANYEXT %6 $w0 = COPY %9(s32) - %2(s64) = COPY $x0 - %3(s32) = G_ICMP intpred(uge), %2, %2 - %7(s8) = G_TRUNC %3(s32) - %10(s32) = G_ANYEXT %7 + %2:gpr(s64) = COPY $x0 + %3:gpr(s32) = G_ICMP intpred(uge), %2, %2 + %7:gpr(s8) = G_TRUNC %3(s32) + %10:gpr(s32) = G_ANYEXT %7 $w0 = COPY %10(s32) - %4(p0) = COPY $x0 - %5(s32) = G_ICMP intpred(ne), %4, %4 - %8(s8) = G_TRUNC %5(s32) - %11(s32) = G_ANYEXT %8 + %4:gpr(p0) = COPY $x0 + %5:gpr(s32) = G_ICMP intpred(ne), %4, %4 + %8:gpr(s8) = G_TRUNC %5(s32) + %11:gpr(s32) = 
G_ANYEXT %8 $w0 = COPY %11(s32) ... --- -# CHECK-LABEL: name: fcmp name: fcmp legalized: true regBankSelected: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr32, preferred-register: '' } -# CHECK-NEXT: - { id: 1, class: gpr32, preferred-register: '' } -# CHECK-NEXT: - { id: 2, class: fpr64, preferred-register: '' } -# CHECK-NEXT: - { id: 3, class: gpr32, preferred-register: '' } -# CHECK-NEXT: - { id: 4, class: gpr32, preferred-register: '' } -# CHECK-NEXT: - { id: 5, class: gpr32, preferred-register: '' } -registers: - - { id: 0, class: fpr } - - { id: 1, class: gpr } - - { id: 2, class: fpr } - - { id: 3, class: gpr } - - { id: 4, class: gpr } - - { id: 5, class: gpr } - - { id: 6, class: gpr } - - { id: 7, class: gpr } - - { id: 8, class: fpr } - - { id: 9, class: gpr } - - { id: 10, class: fpr } - - { id: 11, class: gpr } - - { id: 12, class: gpr } - - { id: 13, class: gpr } - - { id: 14, class: gpr } - - { id: 15, class: gpr } - -# CHECK: body: -# CHECK: nofpexcept FCMPSrr %0, %0, implicit-def $nzcv -# CHECK: [[TST_MI:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 5, implicit $nzcv -# CHECK: [[TST_GT:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 13, implicit $nzcv -# CHECK: %1:gpr32 = ORRWrr [[TST_MI]], [[TST_GT]] - -# CHECK: nofpexcept FCMPDrr %2, %2, implicit-def $nzcv -# CHECK: %3:gpr32 = CSINCWr $wzr, $wzr, 4, implicit $nzcv - body: | bb.0: liveins: $w0, $x0 - %0(s32) = COPY $s0 - %1(s32) = G_FCMP floatpred(one), %0, %0 - %4(s8) = G_TRUNC %1(s32) - %6(s32) = G_ANYEXT %4 - $w0 = COPY %6(s32) + ; CHECK-LABEL: name: fcmp + ; CHECK: liveins: $w0, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr32 = COPY $s0 + ; CHECK-NEXT: nofpexcept FCMPSrr [[COPY]], [[COPY]], implicit-def $nzcv, implicit $fpcr + ; CHECK-NEXT: [[CSINCWr:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 5, implicit $nzcv + ; CHECK-NEXT: [[CSINCWr1:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 13, implicit $nzcv + ; CHECK-NEXT: [[ORRWrr:%[0-9]+]]:gpr32 = ORRWrr [[CSINCWr]], [[CSINCWr1]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32all = COPY [[ORRWrr]] + ; CHECK-NEXT: $w0 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:fpr64 = COPY $d0 + ; CHECK-NEXT: nofpexcept FCMPDrr [[COPY2]], [[COPY2]], implicit-def $nzcv, implicit $fpcr + ; CHECK-NEXT: [[CSINCWr2:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 4, implicit $nzcv + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32all = COPY [[CSINCWr2]] + ; CHECK-NEXT: $w0 = COPY [[COPY3]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY $s0 + ; CHECK-NEXT: nofpexcept FCMPSrr [[COPY4]], [[COPY4]], implicit-def $nzcv, implicit $fpcr + ; CHECK-NEXT: [[CSINCWr3:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 15, implicit $nzcv + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr32all = COPY [[CSINCWr3]] + ; CHECK-NEXT: $w0 = COPY [[COPY5]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:fpr64 = COPY $d0 + ; CHECK-NEXT: nofpexcept FCMPDrr [[COPY6]], [[COPY6]], implicit-def $nzcv, implicit $fpcr + ; CHECK-NEXT: [[CSINCWr4:%[0-9]+]]:gpr32 = CSINCWr $wzr, $wzr, 14, implicit $nzcv + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr32all = COPY [[CSINCWr4]] + ; CHECK-NEXT: $w0 = COPY [[COPY7]] + %0:fpr(s32) = COPY $s0 + %1:gpr(s32) = G_FCMP floatpred(one), %0, %0 + %2:gpr(s8) = G_TRUNC %1(s32) + %3:gpr(s32) = G_ANYEXT %2 + $w0 = COPY %3(s32) - %2(s64) = COPY $d0 - %3(s32) = G_FCMP floatpred(uge), %2, %2 - %5(s8) = G_TRUNC %3(s32) - %7(s32) = G_ANYEXT %5 + %4:fpr(s64) = COPY $d0 + %5:gpr(s32) = G_FCMP floatpred(uge), %4, %4 + %6:gpr(s8) = G_TRUNC %5(s32) + %7:gpr(s32) = G_ANYEXT %6 $w0 = COPY %7(s32) - %8(s32) = COPY $s0 - %9(s32) = G_FCMP floatpred(true), %8, %8 - %12(s8) = 
G_TRUNC %9(s32) - %14(s32) = G_ANYEXT %12 - $w0 = COPY %14(s32) + %8:fpr(s32) = COPY $s0 + %9:gpr(s32) = G_FCMP floatpred(true), %8, %8 + %10:gpr(s8) = G_TRUNC %9(s32) + %11:gpr(s32) = G_ANYEXT %10 + $w0 = COPY %11(s32) - %10(s64) = COPY $d0 - %11(s32) = G_FCMP floatpred(false), %10, %10 - %13(s8) = G_TRUNC %11(s32) - %15(s32) = G_ANYEXT %13 + %12:fpr(s64) = COPY $d0 + %13:gpr(s32) = G_FCMP floatpred(false), %12, %12 + %14:gpr(s8) = G_TRUNC %13(s32) + %15:gpr(s32) = G_ANYEXT %14 $w0 = COPY %15(s32) ... --- -# CHECK-LABEL: name: phi name: phi legalized: true regBankSelected: true tracksRegLiveness: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: fpr32, preferred-register: '' } -# CHECK-NEXT: - { id: 1, class: gpr, preferred-register: '' } -# CHECK-NEXT: - { id: 2, class: fpr32, preferred-register: '' } -# CHECK-NEXT: - { id: 3, class: gpr32, preferred-register: '' } -registers: - - { id: 0, class: fpr } - - { id: 1, class: gpr } - - { id: 2, class: fpr } - -# CHECK: body: -# CHECK: bb.1: -# CHECK: %2:fpr32 = PHI %0, %bb.0, %2, %bb.1 - body: | + ; CHECK-LABEL: name: phi + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $s0, $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr32 = COPY $s0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:fpr32 = PHI [[COPY]], %bb.0, [[PHI]], %bb.1 + ; CHECK-NEXT: TBNZW [[COPY1]], 0, %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: $s0 = COPY [[PHI]] + ; CHECK-NEXT: RET_ReallyLR implicit $s0 bb.0: liveins: $s0, $w0 successors: %bb.1 - %0(s32) = COPY $s0 + %0:fpr(s32) = COPY $s0 %3:gpr(s32) = COPY $w0 bb.1: successors: %bb.1, %bb.2 - %2(s32) = PHI %0, %bb.0, %2, %bb.1 + %2:fpr(s32) = PHI %0, %bb.0, %2, %bb.1 G_BRCOND %3, %bb.1 bb.2: @@ -270,60 +237,46 @@ body: | ... 
--- -# CHECK-LABEL: name: select name: select legalized: true regBankSelected: true tracksRegLiveness: true - -# CHECK: registers: -# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' } -# CHECK-NEXT: - { id: 1, class: gpr32, preferred-register: '' } -# CHECK-NEXT: - { id: 2, class: gpr32, preferred-register: '' } -# CHECK-NEXT: - { id: 3, class: gpr32, preferred-register: '' } -# CHECK-NEXT: - { id: 4, class: gpr64, preferred-register: '' } -# CHECK-NEXT: - { id: 5, class: gpr64, preferred-register: '' } -# CHECK-NEXT: - { id: 6, class: gpr64, preferred-register: '' } -# CHECK-NEXT: - { id: 7, class: gpr64, preferred-register: '' } -# CHECK-NEXT: - { id: 8, class: gpr64, preferred-register: '' } -# CHECK-NEXT: - { id: 9, class: gpr64, preferred-register: '' } -# CHECK-NEXT: - { id: 10, class: gpr32, preferred-register: '' } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - - { id: 3, class: gpr } - - { id: 4, class: gpr } - - { id: 5, class: gpr } - - { id: 6, class: gpr } - - { id: 7, class: gpr } - - { id: 8, class: gpr } - - { id: 9, class: gpr } - -# CHECK: body: -# CHECK: ANDSWri %10, 0, implicit-def $nzcv -# CHECK: %3:gpr32 = CSELWr %1, %2, 1, implicit $nzcv -# CHECK: ANDSWri %10, 0, implicit-def $nzcv -# CHECK: %6:gpr64 = CSELXr %4, %5, 1, implicit $nzcv -# CHECK: ANDSWri %10, 0, implicit-def $nzcv -# CHECK: %9:gpr64 = CSELXr %7, %8, 1, implicit $nzcv body: | bb.0: liveins: $w0, $w1, $w2 + ; CHECK-LABEL: name: select + ; CHECK: liveins: $w0, $w1, $w2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32 = COPY $w2 + ; CHECK-NEXT: [[ANDSWri:%[0-9]+]]:gpr32 = ANDSWri [[COPY]], 0, implicit-def $nzcv + ; CHECK-NEXT: [[CSELWr:%[0-9]+]]:gpr32 = CSELWr [[COPY1]], [[COPY2]], 1, implicit $nzcv + ; CHECK-NEXT: $w0 = COPY [[CSELWr]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[ANDSWri1:%[0-9]+]]:gpr32 = ANDSWri [[COPY]], 0, implicit-def $nzcv + ; CHECK-NEXT: [[CSELXr:%[0-9]+]]:gpr64 = CSELXr [[COPY3]], [[COPY4]], 1, implicit $nzcv + ; CHECK-NEXT: $x0 = COPY [[CSELXr]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64 = COPY $x1 + ; CHECK-NEXT: [[ANDSWri2:%[0-9]+]]:gpr32 = ANDSWri [[COPY]], 0, implicit-def $nzcv + ; CHECK-NEXT: [[CSELXr1:%[0-9]+]]:gpr64 = CSELXr [[COPY5]], [[COPY6]], 1, implicit $nzcv + ; CHECK-NEXT: $x0 = COPY [[CSELXr1]] %10:gpr(s32) = COPY $w0 - %1(s32) = COPY $w1 - %2(s32) = COPY $w2 - %3(s32) = G_SELECT %10, %1, %2 + %1:gpr(s32) = COPY $w1 + %2:gpr(s32) = COPY $w2 + %3:gpr(s32) = G_SELECT %10, %1, %2 $w0 = COPY %3(s32) - %4(s64) = COPY $x0 - %5(s64) = COPY $x1 - %6(s64) = G_SELECT %10, %4, %5 + %4:gpr(s64) = COPY $x0 + %5:gpr(s64) = COPY $x1 + %6:gpr(s64) = G_SELECT %10, %4, %5 $x0 = COPY %6(s64) - %7(p0) = COPY $x0 - %8(p0) = COPY $x1 - %9(p0) = G_SELECT %10, %7, %8 + %7:gpr(p0) = COPY $x0 + %8:gpr(p0) = COPY $x1 + %9:gpr(p0) = G_SELECT %10, %7, %8 $x0 = COPY %9(p0) ... 
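
Aside: the new irtranslator-trunc.ll test earlier in this patch exercises `trunc` carrying `nsw`/`nuw` flags through the IRTranslator onto `G_TRUNC`. As a quick illustration of what the `nuw` flag conveys, here is a minimal standalone sketch, not part of this patch (the function name is hypothetical; the RUN line mirrors the new test). A `nuw` trunc asserts that the dropped high bits are zero, so a later `zext` round-trip can legally be folded away:

```llvm
; RUN: llc -O0 -mtriple=aarch64-linux-gnu -global-isel -stop-after=irtranslator %s -o - | FileCheck %s

; The nuw flag on the IR trunc is expected to appear on the generated G_TRUNC.
; CHECK-LABEL: name: trunc_zext_roundtrip
; CHECK: [[T:%[0-9]+]]:_(s32) = nuw G_TRUNC
; CHECK-NEXT: [[Z:%[0-9]+]]:_(s64) = G_ZEXT [[T]](s32)
define i64 @trunc_zext_roundtrip(i64 %a) {
entry:
  %t = trunc nuw i64 %a to i32
  %z = zext i32 %t to i64
  ret i64 %z
}
```

The `call_trunc_noop_unsigned_vector` case in the new test checks the same `trunc nuw` plus `zext` pairing on vectors, and `call_trunc_noop_signed_vector` is the `nsw`/`sext` analogue.
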
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir index e288d9d5ab3c0..eafd1e15e2cb3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir @@ -16,7 +16,8 @@ body: | ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY1]](s32) ; CHECK-NEXT: [[AMDGPU_ATOMIC_CMPXCHG:%[0-9]+]]:_(s32) = G_AMDGPU_ATOMIC_CMPXCHG [[COPY]](p1), [[BUILD_VECTOR]] :: (load store syncscope("agent-one-as") monotonic monotonic (s32), addrspace 1) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AMDGPU_ATOMIC_CMPXCHG]](s32), [[COPY1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[AMDGPU_ATOMIC_CMPXCHG]](s32), implicit [[ICMP]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AMDGPU_ATOMIC_CMPXCHG]](s32) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s32), implicit [[ICMP]](s1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s32) = COPY $vgpr2 %2:_(s32) = COPY $vgpr3 @@ -40,7 +41,8 @@ body: | ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY1]](s32) ; CHECK-NEXT: [[AMDGPU_ATOMIC_CMPXCHG:%[0-9]+]]:_(s32) = G_AMDGPU_ATOMIC_CMPXCHG [[COPY]](p0), [[BUILD_VECTOR]] :: (load store syncscope("agent-one-as") monotonic monotonic (s32)) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AMDGPU_ATOMIC_CMPXCHG]](s32), [[COPY1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[AMDGPU_ATOMIC_CMPXCHG]](s32), implicit [[ICMP]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AMDGPU_ATOMIC_CMPXCHG]](s32) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s32), implicit [[ICMP]](s1) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(s32) = COPY $vgpr2 %2:_(s32) = COPY $vgpr3 @@ -63,7 +65,8 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 ; CHECK-NEXT: [[ATOMIC_CMPXCHG:%[0-9]+]]:_(s32) = G_ATOMIC_CMPXCHG [[COPY]](p3), [[COPY1]], [[COPY2]] :: (load store syncscope("agent-one-as") monotonic monotonic (s32), addrspace 3) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ATOMIC_CMPXCHG]](s32), [[COPY1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[ATOMIC_CMPXCHG]](s32), implicit [[ICMP]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[ATOMIC_CMPXCHG]](s32) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s32), implicit [[ICMP]](s1) %0:_(p3) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -87,7 +90,8 @@ body: | ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[COPY2]](s64), [[COPY1]](s64) ; CHECK-NEXT: [[AMDGPU_ATOMIC_CMPXCHG:%[0-9]+]]:_(s64) = G_AMDGPU_ATOMIC_CMPXCHG [[COPY]](p1), [[BUILD_VECTOR]] :: (load store syncscope("agent-one-as") monotonic monotonic (s64), addrspace 1) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AMDGPU_ATOMIC_CMPXCHG]](s64), [[COPY1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[AMDGPU_ATOMIC_CMPXCHG]](s64), implicit [[ICMP]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[AMDGPU_ATOMIC_CMPXCHG]](s64) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s64), implicit [[ICMP]](s1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 %2:_(s64) = COPY $vgpr4_vgpr5 @@ -110,7 +114,8 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $vgpr3_vgpr4 ; CHECK-NEXT: [[ATOMIC_CMPXCHG:%[0-9]+]]:_(s64) = G_ATOMIC_CMPXCHG [[COPY]](p3), [[COPY1]], [[COPY2]] :: (load store syncscope("agent-one-as") monotonic monotonic (s64), addrspace 3) ; CHECK-NEXT: 
[[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ATOMIC_CMPXCHG]](s64), [[COPY1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[ATOMIC_CMPXCHG]](s64), implicit [[ICMP]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[ATOMIC_CMPXCHG]](s64) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s64), implicit [[ICMP]](s1) %0:_(p3) = COPY $vgpr0 %1:_(s64) = COPY $vgpr1_vgpr2 %2:_(s64) = COPY $vgpr3_vgpr4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir index dba20e128237c..eb86a981c9f1e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir @@ -86,8 +86,9 @@ body: | ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[ADD]](s32), [[COPY]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s32), [[C]] ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1) - ; CHECK-NEXT: $vgpr0 = COPY [[ADD]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[ZEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -117,8 +118,9 @@ body: | ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s64), [[C]] ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](s64) ; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 @@ -172,11 +174,12 @@ body: | ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY [[BITCAST2]](<2 x s16>) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]] ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]] ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND2]](s32), [[AND3]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; CHECK-NEXT: $vgpr0 = COPY [[COPY3]](<2 x s16>) ; CHECK-NEXT: $vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 @@ -360,13 +363,14 @@ body: | ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR2]](s1) ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR3]](s1) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(<4 x s16>) = COPY [[CONCAT_VECTORS]](<4 x s16>) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]] ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]] ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C3]] ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C3]] ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[AND4]](s32), [[AND5]](s32), [[AND6]](s32), [[AND7]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY5]](<4 x s16>) ; CHECK-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY 
[[BUILD_VECTOR]](<4 x s32>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr1_vgpr2 @@ -403,11 +407,12 @@ body: | ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY [[BUILD_VECTOR]](<2 x s32>) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C1]] ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C1]] ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND]](s32), [[AND1]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](<2 x s32>) ; CHECK-NEXT: $vgpr2_vgpr3 = COPY [[BUILD_VECTOR1]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir index 93d00714158be..80b3166108ad8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir @@ -955,15 +955,16 @@ body: | ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s64), [[C]] ; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX6-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX6-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX6-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO3]] ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) ; ; GFX8-LABEL: name: saddsat_s64 @@ -980,15 +981,16 @@ body: | ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s64), [[C]] ; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX8-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX8-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO3]] ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES 
[[UADDO2]](s32), [[UADDE2]](s32) - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) ; ; GFX9-LABEL: name: saddsat_s64 @@ -1005,15 +1007,16 @@ body: | ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s64), [[C]] ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX9-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO3]] ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 @@ -1043,15 +1046,16 @@ body: | ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV2]](s64), [[C]] ; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX6-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO3]] ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX6-NEXT: [[UADDO4:%[0-9]+]]:_(s32), [[UADDO5:%[0-9]+]]:_(s1) = G_UADDO [[UV12]], [[UV14]] @@ -1060,13 +1064,14 @@ body: | ; GFX6-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV3]](s64), [[C]] ; GFX6-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] - ; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32) + ; GFX6-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) + ; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], 
[[C1]](s32) ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) ; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX6-NEXT: [[UADDO6:%[0-9]+]]:_(s32), [[UADDO7:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]] ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO7]] ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO6]](s32), [[UADDE6]](s32) - ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]] + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]] ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -1086,15 +1091,16 @@ body: | ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV2]](s64), [[C]] ; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX8-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO3]] ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX8-NEXT: [[UADDO4:%[0-9]+]]:_(s32), [[UADDO5:%[0-9]+]]:_(s1) = G_UADDO [[UV12]], [[UV14]] @@ -1103,13 +1109,14 @@ body: | ; GFX8-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV3]](s64), [[C]] ; GFX8-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] - ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32) + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) + ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX8-NEXT: [[UADDO6:%[0-9]+]]:_(s32), [[UADDO7:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]] ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO7]] ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO6]](s32), [[UADDE6]](s32) - ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]] + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]] ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX8-NEXT: 
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -1129,15 +1136,16 @@ body: | ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV2]](s64), [[C]] ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX9-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO3]] ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX9-NEXT: [[UADDO4:%[0-9]+]]:_(s32), [[UADDO5:%[0-9]+]]:_(s1) = G_UADDO [[UV12]], [[UV14]] @@ -1146,13 +1154,14 @@ body: | ; GFX9-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV3]](s64), [[C]] ; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] - ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32) + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) + ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX9-NEXT: [[UADDO6:%[0-9]+]]:_(s32), [[UADDO7:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]] ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO7]] ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO6]](s32), [[UADDE6]](s32) - ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]] + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir index 57b1ab9b194ec..220450c5e4ec6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir @@ -86,8 +86,9 @@ body: | ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SUB]](s32), [[COPY]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s32), [[C]] ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB]](s32) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1) - 
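
Aside: the G_SADDSAT check-line churn above, and the G_SSUBO/G_SSUBSAT hunks that follow, reduce to one mechanical change: the legalizer now materializes an explicit COPY of the intermediate sum before the G_ASHR, and the final G_SELECT consumes that COPY instead of the original G_MERGE_VALUES. The expansion itself is unchanged. For readability, here is a minimal C model of the s64 saturating-add pattern these blocks spell out (a hand-written sketch, not the legalizer's code; the ssubsat variant flips the second compare from slt to sgt):

    #include <stdint.h>

    int64_t saddsat_s64(int64_t lhs, int64_t rhs) {
        // G_UADDO/G_UADDE pair: wrapping 64-bit add built from 32-bit halves.
        int64_t sum = (int64_t)((uint64_t)lhs + (uint64_t)rhs);
        // Two G_ICMPs plus G_XOR: signed overflow occurred iff the sign of
        // rhs disagrees with whether the sum dropped below lhs.
        int overflow = (sum < lhs) != (rhs < 0);
        // G_ASHR by 63, then add INT64_MIN (wrapping): yields INT64_MAX when
        // the wrapped sum is negative, INT64_MIN when it is non-negative.
        int64_t clamp = (int64_t)((uint64_t)(sum >> 63) + 0x8000000000000000ull);
        return overflow ? clamp : sum;   // G_SELECT
    }
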
; CHECK-NEXT: $vgpr0 = COPY [[SUB]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[ZEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -117,8 +118,9 @@ body: | ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]] ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](s64) ; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 @@ -172,11 +174,12 @@ body: | ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY [[BITCAST2]](<2 x s16>) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]] ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]] ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND2]](s32), [[AND3]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; CHECK-NEXT: $vgpr0 = COPY [[COPY3]](<2 x s16>) ; CHECK-NEXT: $vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 @@ -360,13 +363,14 @@ body: | ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR2]](s1) ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR3]](s1) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(<4 x s16>) = COPY [[CONCAT_VECTORS]](<4 x s16>) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]] ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]] ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C3]] ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C3]] ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[AND4]](s32), [[AND5]](s32), [[AND6]](s32), [[AND7]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY5]](<4 x s16>) ; CHECK-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr1_vgpr2 @@ -403,11 +407,12 @@ body: | ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY [[BUILD_VECTOR]](<2 x s32>) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C1]] ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C1]] ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND]](s32), [[AND1]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](<2 x s32>) ; CHECK-NEXT: $vgpr2_vgpr3 = COPY [[BUILD_VECTOR1]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir index 33a8cda8e84b3..49fb6e9bdaf35 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir @@ -955,15 +955,16 @@ body: | ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]] ; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX6-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX6-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX6-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO1]] ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) ; ; GFX8-LABEL: name: ssubsat_s64 @@ -980,15 +981,16 @@ body: | ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]] ; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX8-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX8-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO1]] ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) ; ; GFX9-LABEL: name: ssubsat_s64 @@ -1005,15 +1007,16 @@ body: | ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]] ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = 
G_UNMERGE_VALUES [[C2]](s64) ; GFX9-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO1]] ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 @@ -1043,15 +1046,16 @@ body: | ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV2]](s64), [[C]] ; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX6-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO1]] ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV12]], [[UV14]] @@ -1060,13 +1064,14 @@ body: | ; GFX6-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV3]](s64), [[C]] ; GFX6-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] - ; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32) + ; GFX6-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) + ; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) ; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX6-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]] ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO3]] ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]] + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]] ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -1086,15 +1091,16 @@ body: | ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV2]](s64), [[C]] ; 
GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX8-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO1]] ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV12]], [[UV14]] @@ -1103,13 +1109,14 @@ body: | ; GFX8-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV3]](s64), [[C]] ; GFX8-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] - ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32) + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) + ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX8-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]] ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO3]] ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]] + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]] ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -1129,15 +1136,16 @@ body: | ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV2]](s64), [[C]] ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX9-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO1]] ; 
GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV12]], [[UV14]] @@ -1146,13 +1154,14 @@ body: | ; GFX9-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV3]](s64), [[C]] ; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] - ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32) + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) + ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX9-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]] ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO3]] ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]] + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll index 6eed92ba1d71c..6d4aa3b04d761 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -670,36 +670,19 @@ define amdgpu_kernel void @bfe_sext_in_reg_i24(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: simplify_demanded_bfe_sdiv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, 2.0 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_mul_lo_u32 v1, v0, -2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x100001 -; GFX6-NEXT: s_ashr_i32 s2, s0, 31 -; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_add_i32 s0, s0, s2 -; GFX6-NEXT: s_xor_b32 s0, s0, s2 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 2, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_subrev_i32_e64 v2, s[0:1], 2, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 2, v1 -; GFX6-NEXT: 
v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x100001 +; GFX6-NEXT: s_ashr_i32 s4, s3, 31 +; GFX6-NEXT: s_lshr_b32 s4, s4, 31 +; GFX6-NEXT: s_add_i32 s3, s3, s4 +; GFX6-NEXT: s_ashr_i32 s3, s3, 1 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %src = load i32, ptr addrspace(1) %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 1, i32 16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll index 1061f0003bd48..2c2f8e914447d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -279,125 +279,27 @@ define i32 @v_sdiv_i32_pow2k_denom(i32 %num) { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x45800000 -; CHECK-NEXT: v_mov_b32_e32 v3, 0xfffff000 -; CHECK-NEXT: v_mov_b32_e32 v4, 0x1000 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 20, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] -; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, 0x1000, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i32 %num, 4096 ret i32 %result } define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) { -; GISEL-LABEL: v_sdiv_v2i32_pow2k_denom: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; GISEL-NEXT: v_mov_b32_e32 v3, 0x1000 -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 -; GISEL-NEXT: v_mov_b32_e32 v5, 0xfffff000 -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v0, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 12, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v5 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 12, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v4 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 
s[4:5], v0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v0, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7] -; GISEL-NEXT: v_subrev_i32_e32 v8, vcc, 0x1000, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 -; GISEL-NEXT: s_setpc_b64 s[30:31] -; -; CGP-LABEL: v_sdiv_v2i32_pow2k_denom: -; CGP: ; %bb.0: -; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x45800000 -; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 -; CGP-NEXT: v_mov_b32_e32 v5, 0x1000 -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_mul_lo_u32 v4, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v0, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 -; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; CGP-NEXT: v_lshlrev_b32_e32 v9, 12, v3 -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v0, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[6:7] -; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x1000, v1 -; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] -; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 -; CGP-NEXT: s_setpc_b64 s[30:31] +; CHECK-LABEL: v_sdiv_v2i32_pow2k_denom: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshrrev_b32_e32 v3, 20, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 12, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i32> %num, ret <2 x i32> %result } @@ -884,3 +786,24 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) { %result = sdiv <2 x i32> %num.mask, %den.mask ret <2 x i32> %result } + +define i32 @v_sdiv_i32_exact(i32 %num) { +; CHECK-LABEL: v_sdiv_i32_exact: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %result = sdiv exact i32 %num, 4096 + ret i32 %result +} + +define <2 x i32> @v_sdiv_v2i32_exact(<2 x i32> %num) { +; CHECK-LABEL: v_sdiv_v2i32_exact: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 10, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %result = sdiv exact <2 x i32> %num, + ret <2 x i32> %result +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 84906c01a4698..377fa24cb4755 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -999,602 +999,45 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) { ; CHECK-LABEL: v_sdiv_i64_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000 -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v4, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; 
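
The rewritten bodies of v_sdiv_i32_pow2k_denom and v_sdiv_v2i32_pow2k_denom above, the v_sdiv_i64_pow2k_denom hunk this file is starting, and the new *_exact tests all encode the same shift-based lowering of a signed divide by a power of two. A minimal C model of what the new instruction sequences compute (hand-written sketch; 32-bit shown, the 64-bit variants apply the identical bias-then-ashr structure to a register pair):

    #include <stdint.h>

    int32_t sdiv_by_4096(int32_t x) {
        const int k = 12;                            // 4096 == 1 << 12
        int32_t sign = x >> 31;                      // v_ashrrev_i32_e32 ..., 31
        uint32_t bias = (uint32_t)sign >> (32 - k);  // v_lshrrev_b32_e32 ..., 20
        // Negative inputs get 4095 added first so that the arithmetic shift
        // rounds toward zero, matching sdiv semantics.
        return (int32_t)((uint32_t)x + bias) >> k;   // v_add_i32 + v_ashrrev_i32 ..., 12
    }

    int32_t sdiv_exact_by_4096(int32_t x) {
        // `sdiv exact` promises a zero remainder, so no rounding bias is
        // needed and a single arithmetic shift suffices.
        return x >> 12;
    }
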
CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 -; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x1000 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2] -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v4 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: s_setpc_b64 s[30:31] - %result = sdiv i64 %num, 4096 - ret i64 %result -} - -define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { -; GISEL-LABEL: v_sdiv_v2i64_pow2k_denom: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: s_sub_u32 s6, 0, 0x1000 -; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; 
GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v6, 0 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v7, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v9, v6, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v6, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v5, v[4:5] -; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v12, v11, v9 -; GISEL-NEXT: v_xor_b32_e32 v13, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v5, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v11, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_mov_b32_e32 v5, 0x1000 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_add_i32_e32 
v11, vcc, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v12, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v11, v[8:9] -; GISEL-NEXT: s_sub_u32 s6, 0, 0x1000 -; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v13, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v13, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v9, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v6, 0 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v15, -1, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9] -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v14, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v13, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v7, v0 -; GISEL-NEXT: v_mul_lo_u32 v13, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, v6, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v6, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v6, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v7, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v9, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v8, v[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v12, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v6 -; GISEL-NEXT: v_xor_b32_e32 v12, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; 
GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v8, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v10, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v7, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v10, v[6:7] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v6 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: v_ashr_i64 v[0:1], v[0:1], 12 +; 
CHECK-NEXT: s_setpc_b64 s[30:31] + %result = sdiv i64 %num, 4096 + ret i64 %result +} + +define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { +; GISEL-LABEL: v_sdiv_v2i64_pow2k_denom: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GISEL-NEXT: v_lshrrev_b32_e32 v4, 20, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_lshrrev_b32_e32 v5, 20, v5 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_ashr_i64 v[0:1], v[0:1], 12 +; GISEL-NEXT: v_ashr_i64 v[2:3], v[2:3], 12 ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: v_mov_b32_e32 v6, 0xfffff000 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v7, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v9, v7 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; CGP-NEXT: v_mov_b32_e32 v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; CGP-NEXT: v_mul_hi_u32 v12, v9, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v8, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v8, v13 -; CGP-NEXT: v_mul_lo_u32 v7, v9, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; CGP-NEXT: v_xor_b32_e32 v15, v0, v7 -; CGP-NEXT: v_mul_lo_u32 v0, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v4, v16, v14 -; CGP-NEXT: v_xor_b32_e32 v18, v1, v7 -; CGP-NEXT: v_mul_hi_u32 v1, v16, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v17, v13 +; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; CGP-NEXT: v_lshrrev_b32_e32 v4, 20, v4 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v1, v17, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CGP-NEXT: v_mul_hi_u32 v4, v16, v14 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; CGP-NEXT: 
v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v18, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 -; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v18, v0 -; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v18, v1 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v18, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v17, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc -; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v18, v13 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, 1, v16 -; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v17, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v8, v[0:1] -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v15 -; CGP-NEXT: v_mul_lo_u32 v19, v8, v0 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v19 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v9, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc 
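
For contrast, the GISEL/CGP blocks being deleted in this hunk are the generic divide-by-constant lowering: a float-reciprocal seed refined by a Newton-Raphson step, followed by a multiply-high quotient estimate and up to two conditional fix-ups. Roughly, the 32-bit flavor of that scheme looks like the following C sketch (illustrative names and an assumption-level reconstruction, not the backend's exact code; the surrounding 64-bit sequence builds the same idea out of 32-bit mul/add-with-carry steps, with sign handling via the xor/subtract pairs visible in the deleted lines):

    #include <stdint.h>

    uint32_t udiv_by_const(uint32_t n, uint32_t d) {
        // Seed m ~= floor(2^32 / d); 4294966784.0f is the value of the
        // 0x4f7ffffe immediate that scales v_rcp_iflag_f32's result.
        uint32_t m = (uint32_t)(4294966784.0f / (float)d);
        uint32_t e = 0u - m * d;                          // v_mul_lo_u32 with -d
        m += (uint32_t)(((uint64_t)m * e) >> 32);         // Newton-Raphson refinement
        uint32_t q = (uint32_t)(((uint64_t)n * m) >> 32); // v_mul_hi_u32 estimate
        uint32_t r = n - q * d;
        if (r >= d) { q++; r -= d; }                      // first v_cndmask fix-up
        if (r >= d) { q++; }                              // second v_cndmask fix-up
        return q;
    }
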
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v17, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v8, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v9, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v8, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v8, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v9, v5 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v8, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v9, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v7 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 -; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; CGP-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] -; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v7 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v6 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CGP-NEXT: v_cndmask_b32_e32 
v2, v6, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CGP-NEXT: v_lshrrev_b32_e32 v4, 20, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CGP-NEXT: v_ashr_i64 v[0:1], v[0:1], 12 +; CGP-NEXT: v_ashr_i64 v[2:3], v[2:3], 12 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i64> %num, ret <2 x i64> %result @@ -3398,3 +2841,24 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { %result = sdiv <2 x i64> %num.mask, %den.mask ret <2 x i64> %result } + +define i64 @v_sdiv_i64_exact(i64 %num) { +; CHECK-LABEL: v_sdiv_i64_exact: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_ashr_i64 v[0:1], v[0:1], 12 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %result = sdiv exact i64 %num, 4096 + ret i64 %result +} + +define <2 x i64> @v_sdiv_v2i64_exact(<2 x i64> %num) { +; CHECK-LABEL: v_sdiv_v2i64_exact: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_ashr_i64 v[0:1], v[0:1], 12 +; CHECK-NEXT: v_ashr_i64 v[2:3], v[2:3], 10 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %result = sdiv exact <2 x i64> %num, + ret <2 x i64> %result +} diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 2f3d5d9d140c2..cf99b5d80e13a 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -1,10 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s -; RUN: llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-SDAG-O0 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0 %s -; FIXME: GlobalISel missing the power-of-2 cases in legalization. 
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 2f3d5d9d140c2..cf99b5d80e13a 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -1,10 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-SDAG-O0 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0 %s
 
-; FIXME: GlobalISel missing the power-of-2 cases in legalization. https://github.com/llvm/llvm-project/issues/80671
-; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9 %s
-; xUN: llc -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-O0 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-G %s
+; RUN: llc -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-G-O0 %s
 
 define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-LABEL: v_sdiv_i128_vv:
@@ -1223,6 +1222,1158 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-G-LABEL: v_sdiv_i128_vv:
+; GFX9-G: ; %bb.0: ; %_udiv-special-cases
+; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-G-NEXT: v_ashrrev_i32_e32 v16, 31, v3
+; GFX9-G-NEXT: v_xor_b32_e32 v0, v16, v0
+; GFX9-G-NEXT: v_xor_b32_e32 v1, v16, v1
+; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v0, v16
+; GFX9-G-NEXT: v_xor_b32_e32 v2, v16, v2
+; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v1, v16, vcc
+; GFX9-G-NEXT: v_ashrrev_i32_e32 v17, 31, v7
+; GFX9-G-NEXT: v_xor_b32_e32 v3, v16, v3
+; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v16, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v16, vcc
+; GFX9-G-NEXT: v_xor_b32_e32 v0, v17, v4
+; GFX9-G-NEXT: v_xor_b32_e32 v1, v17, v5
+; GFX9-G-NEXT: v_sub_co_u32_e32 v18, vcc, v0, v17
+; GFX9-G-NEXT: v_xor_b32_e32 v2, v17, v6
+; GFX9-G-NEXT: v_subb_co_u32_e32 v19, vcc, v1, v17, vcc
+; GFX9-G-NEXT: v_xor_b32_e32 v3, v17, v7
+; GFX9-G-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v17, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v17, vcc
+; GFX9-G-NEXT: v_or_b32_e32 v0, v18, v4
+; GFX9-G-NEXT: v_or_b32_e32 v1, v19, v5
+; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX9-G-NEXT: v_or_b32_e32 v0, v10, v12
+; GFX9-G-NEXT: v_or_b32_e32 v1, v11, v13
+; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v18
+; GFX9-G-NEXT: v_ffbh_u32_e32 v0, v19
+; GFX9-G-NEXT: v_add_u32_e32 v1, 32, v1
+; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v4
+; GFX9-G-NEXT: v_min_u32_e32 v0, v0, v1
+; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v5
+; GFX9-G-NEXT: v_add_u32_e32 v2, 32, v2
+; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[4:5]
+; GFX9-G-NEXT: v_add_u32_e32 v0, 64, v0
+; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2
+; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v10
+; GFX9-G-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[6:7]
+; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v11
+; GFX9-G-NEXT: v_add_u32_e32 v2, 32, v2
+; GFX9-G-NEXT: v_ffbh_u32_e32 v3, v12
+; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2
+; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v13
+; GFX9-G-NEXT: v_add_u32_e32 v3, 32, v3
+; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[12:13]
+; GFX9-G-NEXT: v_add_u32_e32 v1, 64, v1
+; GFX9-G-NEXT: v_min_u32_e32 v2, v2, v3
+; GFX9-G-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[6:7]
+; GFX9-G-NEXT: v_sub_co_u32_e64 v0, s[6:7], v0, v1
+; GFX9-G-NEXT: v_subb_co_u32_e64 v1, s[6:7], 0, 0, s[6:7]
+; GFX9-G-NEXT: v_mov_b32_e32 v6, 0x7f
+; GFX9-G-NEXT: v_subb_co_u32_e64 v2, s[6:7], 0, 0, s[6:7]
+; GFX9-G-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-G-NEXT: v_subb_co_u32_e64 v3, s[6:7], 0, 0, s[6:7]
+; GFX9-G-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[6:7]
+; GFX9-G-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-G-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
+; GFX9-G-NEXT: v_cmp_lt_u64_e64 s[6:7], 0, v[2:3]
+; GFX9-G-NEXT: v_or_b32_e32 v15, v1, v3
+;
GFX9-G-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[6:7] +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3] +; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[6:7] +; GFX9-G-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GFX9-G-NEXT: v_or_b32_e32 v20, v7, v6 +; GFX9-G-NEXT: v_xor_b32_e32 v6, 0x7f, v0 +; GFX9-G-NEXT: v_or_b32_e32 v14, v6, v2 +; GFX9-G-NEXT: v_and_b32_e32 v6, 1, v20 +; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v12, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v13, 0, vcc +; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GFX9-G-NEXT: v_or_b32_e32 v14, v20, v14 +; GFX9-G-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GFX9-G-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX9-G-NEXT: s_cbranch_execz .LBB0_6 +; GFX9-G-NEXT: ; %bb.1: ; %udiv-bb1 +; GFX9-G-NEXT: v_add_co_u32_e32 v20, vcc, 1, v0 +; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v1, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v2, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v3, vcc +; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GFX9-G-NEXT: v_sub_co_u32_e32 v8, vcc, 0x7f, v0 +; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v8 +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v8, v[12:13] +; GFX9-G-NEXT: v_subrev_u32_e32 v9, 64, v8 +; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v8, v[10:11] +; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v9, v[10:11] +; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 +; GFX9-G-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v0, v12, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v1, v13, vcc +; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-G-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-G-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-G-NEXT: s_xor_b64 s[12:13], exec, s[8:9] +; GFX9-G-NEXT: s_cbranch_execz .LBB0_5 +; GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader +; GFX9-G-NEXT: v_sub_u32_e32 v2, 64, v20 +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[10:11] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[12:13] +; GFX9-G-NEXT: v_subrev_u32_e32 v24, 64, v20 +; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[12:13] +; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v24, v[12:13] +; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v20 +; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc +; GFX9-G-NEXT: v_add_co_u32_e32 v24, vcc, -1, v18 +; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v20 +; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v19, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v0, v10, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v1, v11, s[4:5] +; GFX9-G-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v4, vcc +; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 +; 
GFX9-G-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v5, vcc +; GFX9-G-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-G-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-G-NEXT: .LBB0_3: ; %udiv-do-while +; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v10, 31, v7 +; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 +; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[12:13] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v9 +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 1, v[14:15] +; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v12 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v14, 31, v13 +; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v24, v2 +; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v25, v3, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v26, v0, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v27, v1, vcc +; GFX9-G-NEXT: v_ashrrev_i32_e32 v28, 31, v12 +; GFX9-G-NEXT: v_and_b32_e32 v12, v28, v18 +; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v2, v12 +; GFX9-G-NEXT: v_and_b32_e32 v2, v28, v19 +; GFX9-G-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v2, vcc +; GFX9-G-NEXT: v_and_b32_e32 v2, v28, v4 +; GFX9-G-NEXT: v_subb_co_u32_e32 v14, vcc, v0, v2, vcc +; GFX9-G-NEXT: v_and_b32_e32 v0, v28, v5 +; GFX9-G-NEXT: v_subb_co_u32_e32 v15, vcc, v1, v0, vcc +; GFX9-G-NEXT: v_add_co_u32_e32 v20, vcc, -1, v20 +; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v22, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_or_b32_e32 v0, v20, v22 +; GFX9-G-NEXT: v_or_b32_e32 v1, v21, v23 +; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX9-G-NEXT: v_and_b32_e32 v10, 1, v28 +; GFX9-G-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v1, v11 +; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-G-NEXT: s_cbranch_execnz .LBB0_3 +; GFX9-G-NEXT: ; %bb.4: ; %Flow +; GFX9-G-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-G-NEXT: .LBB0_5: ; %Flow2 +; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v4 +; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 +; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 +; GFX9-G-NEXT: .LBB0_6: ; %Flow3 +; GFX9-G-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-G-NEXT: v_xor_b32_e32 v3, v17, v16 +; GFX9-G-NEXT: v_xor_b32_e32 v0, v6, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v1, v7, v3 +; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v2, v8, v3 +; GFX9-G-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-G-NEXT: v_xor_b32_e32 v4, v9, v3 +; GFX9-G-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX9-G-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-O0-LABEL: v_sdiv_i128_vv: +; GFX9-G-O0: ; %bb.0: ; %_udiv-special-cases +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: 
buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v1 +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: s_mov_b64 s[12:13], 0x7f +; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v12, v3, v8 +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s6, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v10, v1, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr13_vgpr14 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v11, v3, v8 +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s6, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v9, v1, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 +; 
GFX9-G-O0-NEXT: v_xor_b32_e64 v1, v12, v1 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v4, v12, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v3, v10, v3 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v2, v10, v2 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v1, s[6:7], v1, v12 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[6:7], v4, v12, s[6:7] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v6, s[6:7], v3, v10, s[6:7] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v5, s[6:7], v2, v10, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v5, v11, v5 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v8, v11, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v14 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v7, v9, v7 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v6, v9, v6 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v5, s[6:7], v5, v11 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v15, s[6:7], v8, v11, s[6:7] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v14, s[6:7], v7, v9, s[6:7] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v13, s[6:7], v6, v9, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v13 +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_xor_b32_e64 v13, v11, v12 +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_xor_b32_e64 v11, v11, v12 +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_xor_b32_e64 v11, v9, v10 +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_xor_b32_e64 v9, v9, v10 +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12 +; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11 +; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed 
$exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12 +; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11 +; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], v[5:6] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v6, v7 +; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v6 +; GFX9-G-O0-NEXT: s_mov_b32 s10, 64 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v5, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 +; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v7 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s16, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 +; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v8, v8, v9 +; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v8 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s15, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s11, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s14, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v6, s[8:9], v5, v6 +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s16 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[8:9], v5, v7, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 
offset:28 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v5, v8, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v5, v8, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[12:13], v[14:15] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[12:13], v[14:15] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s13 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[10:11], v[12:13] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[12:13] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[10:11] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[6:7] +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v10 +; GFX9-G-O0-NEXT: s_mov_b32 s7, 0x7f +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v6, v6, s7 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v7, v7, s6 +; GFX9-G-O0-NEXT: v_or_b32_e64 v6, v6, v9 +; GFX9-G-O0-NEXT: v_or_b32_e64 v8, v7, v8 +; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[6:7], v[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-G-O0-NEXT: v_and_b32_e32 v1, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-G-O0-NEXT: v_and_b32_e32 v3, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-G-O0-NEXT: 
v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-G-O0-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], -1 +; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], exec +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s4, 0 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s5, 1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-G-O0-NEXT: s_branch .LBB0_8 +; GFX9-G-O0-NEXT: .LBB0_1: ; %Flow +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v0, 2 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v0, 3 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB0_5 +; GFX9-G-O0-NEXT: .LBB0_3: ; %Flow2 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v4, 0 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v4, 1 +; GFX9-G-O0-NEXT: s_or_b64 
exec, exec, s[4:5] +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB0_9 +; GFX9-G-O0-NEXT: .LBB0_4: ; %udiv-loop-exit +; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[10:11], v0, v[2:3] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[4:5] +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr2 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v6, v2, v3 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v7 +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v1, v5 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-G-O0-NEXT: v_or3_b32 v4, v4, v6, v7 +; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v5 +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB0_3 +; GFX9-G-O0-NEXT: .LBB0_5: ; %Flow1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v8, 4 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v8, 5 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB0_4 +; GFX9-G-O0-NEXT: .LBB0_6: ; %udiv-do-while +; GFX9-G-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s6, v16, 6 +; GFX9-G-O0-NEXT: v_readlane_b32 s7, v16, 7 +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; 
GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[21:22], v2, v[0:1] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[3:4] +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s9, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v2, v3 +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v0, v1 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s9, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v22 +; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v2, v3 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v0, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[23:24], v0, v[2:3] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[12:13] +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr2 killed $exec +; 
GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v14, v2, v3 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v29, v31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v30, v32 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v33 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v34 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v29 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v30 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v23 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v24 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v15 +; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v1, v13 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v22 +; GFX9-G-O0-NEXT: v_or3_b32 v12, v12, v14, v15 +; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v13 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v11, s[8:9], v11, v4 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v10, v9, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v8, v7, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v6, v5, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v8, v6, v10 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v6, v6, v10 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 1 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: v_and_b32_e64 v12, v8, s9 +; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v8, s8 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12_vgpr13 killed $vgpr12_vgpr13 def $vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v25 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v27 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v28 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v23 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v24 +; GFX9-G-O0-NEXT: v_and_b32_e64 v11, v8, v11 +; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v8, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-G-O0-NEXT: v_and_b32_e64 v8, v6, v8 +; GFX9-G-O0-NEXT: v_and_b32_e64 v6, v6, v21 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[8:9], v4, v11 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v9, v10, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v7, v8, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v5, v6, s[8:9] +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: 
v_mov_b32_e32 v11, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v20 +; GFX9-G-O0-NEXT: s_mov_b32 s8, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s12, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s11, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s10, -1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s8 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v17, s[8:9], v11, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v18, s[8:9], v10, v11, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s11 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v20, s[8:9], v9, v10, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v19, s[8:9], v8, v9, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v20 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v19 +; GFX9-G-O0-NEXT: v_or_b32_e64 v17, v17, v20 +; GFX9-G-O0-NEXT: v_or_b32_e64 v19, v18, v19 +; GFX9-G-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[19:20] +; GFX9-G-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 2 +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 3 +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 6 +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 7 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: 
buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_execnz .LBB0_6 +; GFX9-G-O0-NEXT: s_branch .LBB0_1 +; GFX9-G-O0-NEXT: .LBB0_7: ; %udiv-preheader +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b32 s4, 64 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v13, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v13 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v13, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v13, v6 +; GFX9-G-O0-NEXT: v_lshrrev_b64 
v[6:7], v13, v[21:22] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[26:27], v13, v[15:16] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[24:25], v5, v[21:22] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v26 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v27 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v24 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v25 +; GFX9-G-O0-NEXT: v_or_b32_e64 v14, v14, v23 +; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v5, v13 +; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[21:22], v4, v[21:22] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v22 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v16 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v5, v13, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-G-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: ; kill: def $vgpr4_vgpr5 killed $vgpr4_vgpr5 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-G-O0-NEXT: s_mov_b32 s4, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s10, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s7, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s6, -1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v16, s[4:5], v16, v17 +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, s10 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v15, s[4:5], v15, v16, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v14, s[4:5], v14, v15, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v13, s[4:5], v13, v14, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s8, 6 +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s9, 7 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 
offset:268 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB0_6 +; GFX9-G-O0-NEXT: .LBB0_8: ; %udiv-bb1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v2, v5 +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v4, v6, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v8, s[6:7], v3, v4, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v1, v3, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v7 +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte 
Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0x7f +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[6:7], v1, v2 +; GFX9-G-O0-NEXT: s_mov_b32 s7, 64 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v3, v4, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v9, v1, v4 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[8:9], v4, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, v1 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[1:2], v4, v[13:14] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[18:19], v9, v[13:14] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[16:17], v4, v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-G-O0-NEXT: v_or_b32_e64 v10, v10, v15 +; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v4, v9 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[13:14], v3, v[13:14] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[8:9] +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v14 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX9-G-O0-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v8 +; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v6, v7 +; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-G-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[5:6], v[7:8] +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:128 ; 4-byte 
Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 5 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_execz .LBB0_5 +; GFX9-G-O0-NEXT: s_branch .LBB0_7 +; GFX9-G-O0-NEXT: .LBB0_9: ; %udiv-end +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v0, v0, v8 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v1, v1, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v2, v2, v6 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v3, v3, v5 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v0, s[4:5], v0, v8 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v1, s[4:5], v1, v7, s[4:5] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v2, s[4:5], v2, v6, s[4:5] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v3, s[4:5], v3, v5, s[4:5] +; GFX9-G-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31] %div = 
sdiv i128 %lhs, %rhs
   ret i128 %div
 }
@@ -2306,6 +3457,1043 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-G-LABEL: v_udiv_i128_vv:
+; GFX9-G: ; %bb.0: ; %_udiv-special-cases
+; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-G-NEXT: v_or_b32_e32 v8, v4, v6
+; GFX9-G-NEXT: v_or_b32_e32 v9, v5, v7
+; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX9-G-NEXT: v_or_b32_e32 v8, v0, v2
+; GFX9-G-NEXT: v_or_b32_e32 v9, v1, v3
+; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9]
+; GFX9-G-NEXT: v_ffbh_u32_e32 v9, v4
+; GFX9-G-NEXT: v_ffbh_u32_e32 v8, v5
+; GFX9-G-NEXT: v_add_u32_e32 v9, 32, v9
+; GFX9-G-NEXT: v_ffbh_u32_e32 v10, v6
+; GFX9-G-NEXT: v_min_u32_e32 v8, v8, v9
+; GFX9-G-NEXT: v_ffbh_u32_e32 v9, v7
+; GFX9-G-NEXT: v_add_u32_e32 v10, 32, v10
+; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[6:7]
+; GFX9-G-NEXT: v_add_u32_e32 v8, 64, v8
+; GFX9-G-NEXT: v_min_u32_e32 v9, v9, v10
+; GFX9-G-NEXT: v_ffbh_u32_e32 v10, v0
+; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[6:7]
+; GFX9-G-NEXT: v_ffbh_u32_e32 v9, v1
+; GFX9-G-NEXT: v_add_u32_e32 v10, 32, v10
+; GFX9-G-NEXT: v_ffbh_u32_e32 v11, v2
+; GFX9-G-NEXT: v_min_u32_e32 v9, v9, v10
+; GFX9-G-NEXT: v_ffbh_u32_e32 v10, v3
+; GFX9-G-NEXT: v_add_u32_e32 v11, 32, v11
+; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3]
+; GFX9-G-NEXT: v_add_u32_e32 v9, 64, v9
+; GFX9-G-NEXT: v_min_u32_e32 v10, v10, v11
+; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v10, v9, s[6:7]
+; GFX9-G-NEXT: v_sub_co_u32_e64 v12, s[6:7], v8, v9
+; GFX9-G-NEXT: v_subb_co_u32_e64 v13, s[6:7], 0, 0, s[6:7]
+; GFX9-G-NEXT: v_mov_b32_e32 v8, 0x7f
+; GFX9-G-NEXT: v_subb_co_u32_e64 v14, s[6:7], 0, 0, s[6:7]
+; GFX9-G-NEXT: v_mov_b32_e32 v9, 0
+; GFX9-G-NEXT: v_subb_co_u32_e64 v15, s[6:7], 0, 0, s[6:7]
+; GFX9-G-NEXT: v_cmp_gt_u64_e64 s[6:7], v[12:13], v[8:9]
+; GFX9-G-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-G-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[6:7]
+; GFX9-G-NEXT: v_cmp_lt_u64_e64 s[6:7], 0, v[14:15]
+; GFX9-G-NEXT: v_or_b32_e32 v17, v13, v15
+; GFX9-G-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[6:7]
+; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
+; GFX9-G-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[6:7]
+; GFX9-G-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; GFX9-G-NEXT: v_or_b32_e32 v18, v9, v8
+; GFX9-G-NEXT: v_xor_b32_e32 v8, 0x7f, v12
+; GFX9-G-NEXT: v_or_b32_e32 v16, v8, v14
+; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v18
+; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v0, 0, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v1, 0, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc
+; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GFX9-G-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GFX9-G-NEXT: v_or_b32_e32 v16, v18, v16
+; GFX9-G-NEXT: v_and_b32_e32 v16, 1, v16
+; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GFX9-G-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX9-G-NEXT: s_cbranch_execz .LBB1_6
+; GFX9-G-NEXT: ; %bb.1: ; %udiv-bb1
+; GFX9-G-NEXT: v_add_co_u32_e32 v18, vcc, 1, v12
+; GFX9-G-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v13, vcc
+; GFX9-G-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v14, vcc
+; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v15, vcc
+; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GFX9-G-NEXT: v_sub_co_u32_e32 v16, vcc, 0x7f, v12
+; GFX9-G-NEXT: v_sub_u32_e32 v8, 64, v16
+; GFX9-G-NEXT: v_lshrrev_b64
v[8:9], v8, v[0:1] +; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], v16, v[2:3] +; GFX9-G-NEXT: v_subrev_u32_e32 v14, 64, v16 +; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], v16, v[0:1] +; GFX9-G-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX9-G-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], v14, v[0:1] +; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-G-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v15, 0, v13, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX9-G-NEXT: v_mov_b32_e32 v13, s11 +; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v8, v2, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v9, v3, vcc +; GFX9-G-NEXT: v_mov_b32_e32 v11, s9 +; GFX9-G-NEXT: v_mov_b32_e32 v10, s8 +; GFX9-G-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-G-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-G-NEXT: s_xor_b64 s[12:13], exec, s[8:9] +; GFX9-G-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader +; GFX9-G-NEXT: v_sub_u32_e32 v12, 64, v18 +; GFX9-G-NEXT: v_subrev_u32_e32 v22, 64, v18 +; GFX9-G-NEXT: v_lshrrev_b64 v[10:11], v18, v[0:1] +; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], v12, v[2:3] +; GFX9-G-NEXT: v_lshrrev_b64 v[16:17], v18, v[2:3] +; GFX9-G-NEXT: v_lshrrev_b64 v[2:3], v22, v[2:3] +; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v12 +; GFX9-G-NEXT: v_or_b32_e32 v11, v11, v13 +; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 +; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX9-G-NEXT: v_add_co_u32_e32 v22, vcc, -1, v4 +; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v5, vcc +; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 +; GFX9-G-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v6, vcc +; GFX9-G-NEXT: v_mov_b32_e32 v13, s11 +; GFX9-G-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v3, v3, v1, s[4:5] +; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v7, vcc +; GFX9-G-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-G-NEXT: v_mov_b32_e32 v11, s9 +; GFX9-G-NEXT: v_mov_b32_e32 v10, s8 +; GFX9-G-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-G-NEXT: .LBB1_3: ; %udiv-do-while +; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[14:15] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v0, 31, v15 +; GFX9-G-NEXT: v_or_b32_e32 v14, v10, v12 +; GFX9-G-NEXT: v_or_b32_e32 v15, v11, v13 +; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[16:17] +; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], 1, v[2:3] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v3 +; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v2 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v9 +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_or_b32_e32 v2, v10, v2 +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v0 +; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v22, v2 +; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v23, v11, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v24, v12, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v25, v13, vcc +; GFX9-G-NEXT: v_add_co_u32_e64 v18, s[4:5], -1, v18 +; GFX9-G-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX9-G-NEXT: v_addc_co_u32_e64 v19, s[4:5], -1, v19, s[4:5] +; GFX9-G-NEXT: v_and_b32_e32 v10, v3, v4 +; GFX9-G-NEXT: v_addc_co_u32_e64 v20, s[4:5], -1, v20, s[4:5] +; GFX9-G-NEXT: v_and_b32_e32 v16, v3, v5 +; GFX9-G-NEXT: v_sub_co_u32_e32 
v2, vcc, v2, v10 +; GFX9-G-NEXT: v_addc_co_u32_e64 v21, s[4:5], -1, v21, s[4:5] +; GFX9-G-NEXT: v_and_b32_e32 v0, 1, v3 +; GFX9-G-NEXT: v_and_b32_e32 v17, v3, v6 +; GFX9-G-NEXT: v_and_b32_e32 v26, v3, v7 +; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v11, v16, vcc +; GFX9-G-NEXT: v_or_b32_e32 v10, v18, v20 +; GFX9-G-NEXT: v_or_b32_e32 v11, v19, v21 +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] +; GFX9-G-NEXT: v_subb_co_u32_e32 v16, vcc, v12, v17, vcc +; GFX9-G-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-G-NEXT: v_subb_co_u32_e32 v17, vcc, v13, v26, vcc +; GFX9-G-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-G-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-G-NEXT: ; %bb.4: ; %Flow +; GFX9-G-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-G-NEXT: .LBB1_5: ; %Flow2 +; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 1, v[14:15] +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v15 +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v2 +; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v0 +; GFX9-G-NEXT: v_or_b32_e32 v11, v11, v1 +; GFX9-G-NEXT: .LBB1_6: ; %Flow3 +; GFX9-G-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-G-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-G-NEXT: v_mov_b32_e32 v1, v11 +; GFX9-G-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-G-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-G-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-O0-LABEL: v_udiv_i128_vv: +; GFX9-G-O0: ; %bb.0: ; %_udiv-special-cases +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v5 +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12 +; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11 +; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12 +; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11 +; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], v[5:6] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v6, v7 +; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v6 +; GFX9-G-O0-NEXT: s_mov_b32 s10, 64 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v5, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 +; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v7 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s14, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 
v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 +; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v8, v8, v9 +; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v8 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s13, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s11, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s12, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v6, s[8:9], v5, v6 +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s14 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[8:9], v5, v7, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v5, v8, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v5, v8, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[12:13], 0x7f +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[12:13], v[14:15] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[12:13], v[14:15] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s13 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[10:11], v[12:13] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[12:13] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[10:11] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[6:7] +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v10 +; GFX9-G-O0-NEXT: s_mov_b32 s7, 0x7f +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v6, v6, s7 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v7, v7, s6 +; GFX9-G-O0-NEXT: v_or_b32_e64 v6, v6, v9 +; GFX9-G-O0-NEXT: v_or_b32_e64 v8, v7, v8 +; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s4 +; GFX9-G-O0-NEXT: 
v_cmp_eq_u64_e64 s[4:5], v[6:7], v[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-G-O0-NEXT: v_and_b32_e32 v1, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-G-O0-NEXT: v_and_b32_e32 v3, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-G-O0-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], -1 +; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], exec +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s4, 0 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s5, 1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-G-O0-NEXT: s_branch .LBB1_8 +; GFX9-G-O0-NEXT: .LBB1_1: ; %Flow +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v0, 2 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v0, 3 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 
4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB1_5 +; GFX9-G-O0-NEXT: .LBB1_3: ; %Flow2 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v4, 0 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v4, 1 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB1_9 +; GFX9-G-O0-NEXT: .LBB1_4: ; %udiv-loop-exit +; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[10:11], v0, v[2:3] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[4:5] +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 
killed $vgpr2 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v6, v2, v3 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v7 +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v1, v5 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-G-O0-NEXT: v_or3_b32 v4, v4, v6, v7 +; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v5 +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB1_3 +; GFX9-G-O0-NEXT: .LBB1_5: ; %Flow1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v8, 4 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v8, 5 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB1_4 +; GFX9-G-O0-NEXT: .LBB1_6: ; %udiv-do-while +; GFX9-G-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s6, v16, 6 +; GFX9-G-O0-NEXT: v_readlane_b32 s7, v16, 7 +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[21:22], v2, v[0:1] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[3:4] +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: 
s_mov_b32 s9, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v2, v3 +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v0, v1 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s9, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v22 +; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v2, v3 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v0, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[23:24], v0, v[2:3] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[12:13] +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr2 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v14, v2, v3 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v29, v31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v30, v32 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v33 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v34 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v29 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v30 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v23 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v24 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v15 +; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v1, v13 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v22 +; GFX9-G-O0-NEXT: v_or3_b32 v12, v12, v14, v15 +; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v13 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v11, s[8:9], v11, v4 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v10, v9, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v8, v7, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v6, v5, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v8, v6, v10 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v6, v6, v10 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 1 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: v_and_b32_e64 v12, v8, s9 +; 
GFX9-G-O0-NEXT: v_and_b32_e64 v10, v8, s8 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12_vgpr13 killed $vgpr12_vgpr13 def $vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v25 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v27 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v28 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v23 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v24 +; GFX9-G-O0-NEXT: v_and_b32_e64 v11, v8, v11 +; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v8, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-G-O0-NEXT: v_and_b32_e64 v8, v6, v8 +; GFX9-G-O0-NEXT: v_and_b32_e64 v6, v6, v21 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[8:9], v4, v11 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v9, v10, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v7, v8, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v5, v6, s[8:9] +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v20 +; GFX9-G-O0-NEXT: s_mov_b32 s8, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s12, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s11, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s10, -1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s8 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v17, s[8:9], v11, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v18, s[8:9], v10, v11, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s11 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v20, s[8:9], v9, v10, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v19, s[8:9], v8, v9, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v20 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v19 +; GFX9-G-O0-NEXT: v_or_b32_e64 v17, v17, v20 +; GFX9-G-O0-NEXT: v_or_b32_e64 v19, v18, v19 +; GFX9-G-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[19:20] +; GFX9-G-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 
offset:120 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 2 +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 3 +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 6 +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 7 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_execnz .LBB1_6 +; GFX9-G-O0-NEXT: s_branch .LBB1_1 +; GFX9-G-O0-NEXT: .LBB1_7: ; %udiv-preheader +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: buffer_load_dword 
v17, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b32 s4, 64 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v13, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v13 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v13, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v13, v6 +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v13, v[21:22] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[26:27], v13, v[15:16] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[24:25], v5, v[21:22] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v26 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v27 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v24 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v25 +; GFX9-G-O0-NEXT: v_or_b32_e64 v14, v14, v23 +; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v5, v13 +; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[21:22], v4, v[21:22] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v22 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v16 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v5, v13, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-G-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: ; kill: def $vgpr4_vgpr5 killed $vgpr4_vgpr5 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-G-O0-NEXT: s_mov_b32 s4, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s10, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s7, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s6, -1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v16, s[4:5], v16, v17 +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, 
s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, s10 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v15, s[4:5], v15, v16, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v14, s[4:5], v14, v15, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v13, s[4:5], v13, v14, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s8, 6 +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s9, 7 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB1_6 +; GFX9-G-O0-NEXT: .LBB1_8: ; %udiv-bb1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v2, v5 +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v4, v6, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v8, s[6:7], v3, v4, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v1, v3, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v7 +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0x7f +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[6:7], v1, v2 +; GFX9-G-O0-NEXT: s_mov_b32 s7, 64 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v3, v4, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v9, v1, v4 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[8:9], v4, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, v1 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[1:2], v4, v[13:14] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[18:19], v9, v[13:14] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[16:17], v4, v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-G-O0-NEXT: v_or_b32_e64 v10, v10, v15 +; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v4, v9 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[13:14], v3, v[13:14] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[8:9] +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v14 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; 
GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX9-G-O0-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v8 +; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v6, v7 +; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-G-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[5:6], v[7:8] +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 5 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-G-O0-NEXT: s_branch .LBB1_7 +; GFX9-G-O0-NEXT: .LBB1_9: ; %udiv-end +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-G-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_nop 
0 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31] %div = udiv i128 %lhs, %rhs ret i128 %div } @@ -2388,6 +4576,66 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-LABEL: v_sdiv_i128_v_pow2k: +; GFX9-G: ; %bb.0: +; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-G-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-G-NEXT: v_lshrrev_b64 v[4:5], 31, v[4:5] +; GFX9-G-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-G-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v3, 1, v4 +; GFX9-G-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX9-G-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-G-NEXT: v_ashrrev_i32_e32 v2, 1, v2 +; GFX9-G-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-O0-LABEL: v_sdiv_i128_v_pow2k: +; GFX9-G-O0: ; %bb.0: +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v0, v0, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v0, v[5:6] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s5, 0 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v4, v5 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v1, s[6:7], v1, v0, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v2, v0, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v4, s[6:7], v3, v0, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-G-O0-NEXT: s_mov_b32 s5, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1 +; GFX9-G-O0-NEXT: s_mov_b32 s5, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[5:6] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v3 +; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v3, v2, v4 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v2, v2, v4 +; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31] %div = sdiv i128 %lhs, 8589934592 ret i128 %div } @@ -2434,10 +4682,42 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: 
v_mov_b32_e32 v3, 0 ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-LABEL: v_udiv_i128_v_pow2k: +; GFX9-G: ; %bb.0: +; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v4 +; GFX9-G-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v3 +; GFX9-G-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-G-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-O0-LABEL: v_udiv_i128_v_pow2k: +; GFX9-G-O0: ; %bb.0: +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[4:5] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v4 +; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v2, v3 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31] %div = udiv i128 %lhs, 8589934592 ret i128 %div } - -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX9-SDAG: {{.*}} -; GFX9-SDAG-O0: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 549bb6d6a28b0..47923fddae382 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -2479,8 +2479,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1032-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX1032-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1032-NEXT: v_mul_lo_u32 v2, s1, v1 -; GFX1032-NEXT: s_ff1_i32_b32 s1, 0x80000000 -; GFX1032-NEXT: s_add_i32 s1, s1, 32 +; GFX1032-NEXT: s_brev_b32 s1, 1 ; GFX1032-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1032-NEXT: v_mul_hi_u32 v1, v0, v1 @@ -2494,8 +2493,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1032-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1 -; GFX1032-NEXT: s_ff1_i32_b32 s0, s0 -; GFX1032-NEXT: s_min_u32 s0, s0, s1 +; GFX1032-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -2529,10 +2527,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_lshr_b64 s[0:1], vcc, 1 ; GFX1064-NEXT: s_bitset1_b32 s1, 31 -; GFX1064-NEXT: s_ff1_i32_b32 s0, s0 -; GFX1064-NEXT: s_ff1_i32_b32 s1, s1 -; GFX1064-NEXT: s_add_i32 s1, s1, 32 -; GFX1064-NEXT: s_min_u32 s0, s0, s1 +; GFX1064-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] @@ -2576,9 +2571,8 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 ; GFX1032-NEXT: v_div_scale_f32 v4, vcc_lo, v0, s0, v0 -; GFX1032-NEXT: s_ff1_i32_b32 s1, 
0x80000000 +; GFX1032-NEXT: s_brev_b32 s1, 1 ; GFX1032-NEXT: v_rcp_f32_e32 v2, v1 -; GFX1032-NEXT: s_add_i32 s1, s1, 32 ; GFX1032-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX1032-NEXT: v_fmac_f32_e32 v2, v3, v2 ; GFX1032-NEXT: v_mul_f32_e32 v3, v4, v2 @@ -2592,8 +2586,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1032-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1 ; GFX1032-NEXT: v_cmp_nlg_f32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_ff1_i32_b32 s0, s0 -; GFX1032-NEXT: s_min_u32 s0, s0, s1 +; GFX1032-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -2609,15 +2602,15 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 -; GFX1064-NEXT: v_div_scale_f32 v4, vcc, v0, s2, v0 ; GFX1064-NEXT: v_rcp_f32_e32 v2, v1 ; GFX1064-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX1064-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX1064-NEXT: v_mul_f32_e32 v3, v4, v2 -; GFX1064-NEXT: v_fma_f32 v5, -v1, v3, v4 -; GFX1064-NEXT: v_fmac_f32_e32 v3, v5, v2 -; GFX1064-NEXT: v_fma_f32 v1, -v1, v3, v4 -; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v3 +; GFX1064-NEXT: v_div_scale_f32 v3, vcc, v0, s2, v0 +; GFX1064-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX1064-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX1064-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX1064-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX1064-NEXT: v_div_fixup_f32 v1, v1, s2, v0 ; GFX1064-NEXT: v_trunc_f32_e32 v1, v1 ; GFX1064-NEXT: v_fma_f32 v0, -v1, s2, v0 @@ -2625,10 +2618,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1064-NEXT: s_lshr_b64 s[0:1], vcc, 1 ; GFX1064-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_bitset1_b32 s1, 31 -; GFX1064-NEXT: s_ff1_i32_b32 s0, s0 -; GFX1064-NEXT: s_ff1_i32_b32 s1, s1 -; GFX1064-NEXT: s_add_i32 s1, s1, 32 -; GFX1064-NEXT: s_min_u32 s0, s0, s1 +; GFX1064-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] diff --git a/llvm/test/CodeGen/DirectX/ceil.ll b/llvm/test/CodeGen/DirectX/ceil.ll new file mode 100644 index 0000000000000..1585471467801 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ceil.ll @@ -0,0 +1,20 @@ +; RUN: opt -S -dxil-op-lower < %s | FileCheck %s + +; Make sure dxil operation function calls for ceil are generated for float and half. 
+
+define noundef float @ceil_float(float noundef %a) {
+entry:
+; CHECK:call float @dx.op.unary.f32(i32 28, float %{{.*}})
+  %elt.ceil = call float @llvm.ceil.f32(float %a)
+  ret float %elt.ceil
+}
+
+define noundef half @ceil_half(half noundef %a) {
+entry:
+; CHECK:call half @dx.op.unary.f16(i32 28, half %{{.*}})
+  %elt.ceil = call half @llvm.ceil.f16(half %a)
+  ret half %elt.ceil
+}
+
+declare half @llvm.ceil.f16(half)
+declare float @llvm.ceil.f32(float)
diff --git a/llvm/test/CodeGen/DirectX/ceil_error.ll b/llvm/test/CodeGen/DirectX/ceil_error.ll
new file mode 100644
index 0000000000000..1b554d8715566
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ceil_error.ll
@@ -0,0 +1,10 @@
+; RUN: not opt -S -dxil-op-lower %s 2>&1 | FileCheck %s
+
+; DXIL operation ceil does not support double overload type
+; CHECK: LLVM ERROR: Invalid Overload Type
+
+define noundef double @ceil_double(double noundef %a) {
+entry:
+  %elt.ceil = call double @llvm.ceil.f64(double %a)
+  ret double %elt.ceil
+}
diff --git a/llvm/test/CodeGen/DirectX/round.ll b/llvm/test/CodeGen/DirectX/round.ll
index 5d53a794b763a..e0a3772ebca8f 100644
--- a/llvm/test/CodeGen/DirectX/round.ll
+++ b/llvm/test/CodeGen/DirectX/round.ll
@@ -1,31 +1,22 @@
 ; RUN: opt -S -dxil-op-lower < %s | FileCheck %s
 
 ; Make sure dxil operation function calls for round are generated for float and half.
-; CHECK:call float @dx.op.unary.f32(i32 26, float %{{.*}})
-; CHECK:call half @dx.op.unary.f16(i32 26, half %{{.*}})
-target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
-target triple = "dxil-pc-shadermodel6.7-library"
-
-; Function Attrs: noinline nounwind optnone
-define noundef float @round_float(float noundef %a) #0 {
+; CHECK-LABEL: round_half
+define noundef half @round_half(half noundef %a) {
 entry:
-  %a.addr = alloca float, align 4
-  store float %a, ptr %a.addr, align 4
-  %0 = load float, ptr %a.addr, align 4
-  %elt.round = call float @llvm.round.f32(float %0)
-  ret float %elt.round
+; CHECK: call half @dx.op.unary.f16(i32 26, half %{{.*}})
+  %elt.roundeven = call half @llvm.roundeven.f16(half %a)
+  ret half %elt.roundeven
 }
 
-; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
-declare float @llvm.round.f32(float) #1
-
-; Function Attrs: noinline nounwind optnone
-define noundef half @round_half(half noundef %a) #0 {
+; CHECK-LABEL: round_float
+define noundef float @round_float(float noundef %a) {
 entry:
-  %a.addr = alloca half, align 2
-  store half %a, ptr %a.addr, align 2
-  %0 = load half, ptr %a.addr, align 2
-  %elt.round = call half @llvm.round.f16(half %0)
-  ret half %elt.round
+; CHECK: call float @dx.op.unary.f32(i32 26, float %{{.*}})
+  %elt.roundeven = call float @llvm.roundeven.f32(float %a)
+  ret float %elt.roundeven
 }
+
+declare half @llvm.roundeven.f16(half)
+declare float @llvm.roundeven.f32(float)
diff --git a/llvm/test/CodeGen/DirectX/round_error.ll b/llvm/test/CodeGen/DirectX/round_error.ll
index 3bd87b2bbf020..2d27fbb5ee20d 100644
--- a/llvm/test/CodeGen/DirectX/round_error.ll
+++ b/llvm/test/CodeGen/DirectX/round_error.ll
@@ -8,6 +8,6 @@ entry:
   %a.addr = alloca double, align 8
   store double %a, ptr %a.addr, align 8
   %0 = load double, ptr %a.addr, align 8
-  %elt.round = call double @llvm.round.f64(double %0)
-  ret double %elt.round
+  %elt.roundeven = call double @llvm.roundeven.f64(double %0)
+  ret double %elt.roundeven
 }
diff --git a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/add.mir b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/add.mir
index 
52352edbe3392..e471e1047caa2 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/add.mir +++ b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/add.mir @@ -220,10 +220,12 @@ body: | ; MIPS32-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3 ; MIPS32-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY3]], [[COPY1]] ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[COPY1]] + ; MIPS32-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) ; MIPS32-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY]] ; MIPS32-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ICMP]] - ; MIPS32-NEXT: $v0 = COPY [[ADD2]](s32) - ; MIPS32-NEXT: $v1 = COPY [[ADD]](s32) + ; MIPS32-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32) + ; MIPS32-NEXT: $v0 = COPY [[COPY5]](s32) + ; MIPS32-NEXT: $v1 = COPY [[COPY4]](s32) ; MIPS32-NEXT: RetRA implicit $v0, implicit $v1 %2:_(s32) = COPY $a0 %3:_(s32) = COPY $a1 @@ -268,6 +270,7 @@ body: | ; MIPS32-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX3]](p0) :: (load (s32) from %fixed-stack.3) ; MIPS32-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD]], [[COPY]] ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[COPY]] + ; MIPS32-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) ; MIPS32-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[COPY1]] ; MIPS32-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD1]](s32), [[LOAD1]] ; MIPS32-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ICMP]] @@ -275,6 +278,7 @@ body: | ; MIPS32-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[ADD2]](s32), [[C]] ; MIPS32-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ICMP2]], [[ICMP]] ; MIPS32-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ICMP1]], [[AND]] + ; MIPS32-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32) ; MIPS32-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[LOAD2]], [[COPY2]] ; MIPS32-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD3]](s32), [[LOAD2]] ; MIPS32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 @@ -283,13 +287,15 @@ body: | ; MIPS32-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[ADD4]](s32), [[C]] ; MIPS32-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ICMP4]], [[OR]] ; MIPS32-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ICMP3]], [[AND2]] + ; MIPS32-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[ADD4]](s32) ; MIPS32-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[LOAD3]], [[COPY3]] ; MIPS32-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[OR1]], [[C1]] ; MIPS32-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[ADD5]], [[AND3]] - ; MIPS32-NEXT: $v0 = COPY [[ADD]](s32) - ; MIPS32-NEXT: $v1 = COPY [[ADD2]](s32) - ; MIPS32-NEXT: $a0 = COPY [[ADD4]](s32) - ; MIPS32-NEXT: $a1 = COPY [[ADD6]](s32) + ; MIPS32-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD6]](s32) + ; MIPS32-NEXT: $v0 = COPY [[COPY4]](s32) + ; MIPS32-NEXT: $v1 = COPY [[COPY5]](s32) + ; MIPS32-NEXT: $a0 = COPY [[COPY6]](s32) + ; MIPS32-NEXT: $a1 = COPY [[COPY7]](s32) ; MIPS32-NEXT: RetRA implicit $v0, implicit $v1, implicit $a0, implicit $a1 %2:_(s32) = COPY $a0 %3:_(s32) = COPY $a1 @@ -331,10 +337,11 @@ body: | ; MIPS32-NEXT: [[COPY3:%[0-9]+]]:_(p0) = COPY $a3 ; MIPS32-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]] ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[COPY1]] + ; MIPS32-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) ; MIPS32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; MIPS32-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ICMP]], [[C]] ; MIPS32-NEXT: G_STORE [[AND]](s32), [[COPY3]](p0) :: (store (s8) into %ir.pcarry_flag) - ; MIPS32-NEXT: G_STORE [[ADD]](s32), 
[[COPY2]](p0) :: (store (s32) into %ir.padd) + ; MIPS32-NEXT: G_STORE [[COPY4]](s32), [[COPY2]](p0) :: (store (s32) into %ir.padd) ; MIPS32-NEXT: RetRA %0:_(s32) = COPY $a0 %1:_(s32) = COPY $a1 diff --git a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/ctpop.mir b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/ctpop.mir index 4c0b3c6177219..f518e9ec9e589 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/ctpop.mir +++ b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/ctpop.mir @@ -29,8 +29,8 @@ body: | ; MIPS32-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135 ; MIPS32-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C5]] ; MIPS32-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009 - ; MIPS32-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C6]] ; MIPS32-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; MIPS32-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C6]] ; MIPS32-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C7]](s32) ; MIPS32-NEXT: $v0 = COPY [[LSHR3]](s32) ; MIPS32-NEXT: RetRA implicit $v0 @@ -70,8 +70,8 @@ body: | ; MIPS32-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135 ; MIPS32-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C5]] ; MIPS32-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009 - ; MIPS32-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C6]] ; MIPS32-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; MIPS32-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C6]] ; MIPS32-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C7]](s32) ; MIPS32-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C]](s32) ; MIPS32-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR4]], [[C1]] diff --git a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/cttz.mir b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/cttz.mir index 3e7bcdc39d5d9..a06bb6da45d23 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/cttz.mir +++ b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/cttz.mir @@ -139,9 +139,11 @@ body: | ; MIPS32-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[ADD1]], [[SUB1]] ; MIPS32-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[SELECT]], [[C]] ; MIPS32-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD3]](s32), [[C]] + ; MIPS32-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ADD3]](s32) ; MIPS32-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[C1]], [[C1]] ; MIPS32-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[ADD4]], [[ICMP1]] - ; MIPS32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[ADD3]](s32), [[ADD5]](s32) + ; MIPS32-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[ADD5]](s32) + ; MIPS32-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; MIPS32-NEXT: [[XOR2:%[0-9]+]]:_(s32) = G_XOR [[COPY]], [[C1]] ; MIPS32-NEXT: [[XOR3:%[0-9]+]]:_(s32) = G_XOR [[COPY1]], [[C1]] ; MIPS32-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[XOR2]], [[XOR3]] diff --git a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/mul.mir b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/mul.mir index 7ad286b952cb1..674d7b68bfae6 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/mul.mir +++ b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/mul.mir @@ -275,8 +275,10 @@ body: | ; MIPS32-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[LOAD]], [[COPY]] ; MIPS32-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL1]], [[MUL2]] ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[MUL2]] - ; MIPS32-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]] + ; MIPS32-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) + ; MIPS32-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY4]], [[UMULH]] ; MIPS32-NEXT: 
[[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD1]](s32), [[UMULH]] + ; MIPS32-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32) ; MIPS32-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ICMP]], [[ICMP1]] ; MIPS32-NEXT: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[LOAD2]], [[COPY]] ; MIPS32-NEXT: [[MUL4:%[0-9]+]]:_(s32) = G_MUL [[LOAD1]], [[COPY1]] @@ -285,17 +287,22 @@ body: | ; MIPS32-NEXT: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[LOAD]], [[COPY1]] ; MIPS32-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[MUL3]], [[MUL4]] ; MIPS32-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD3]](s32), [[MUL4]] - ; MIPS32-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[ADD3]], [[MUL5]] + ; MIPS32-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[ADD3]](s32) + ; MIPS32-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[COPY6]], [[MUL5]] ; MIPS32-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD4]](s32), [[MUL5]] + ; MIPS32-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD4]](s32) ; MIPS32-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[ICMP2]], [[ICMP3]] - ; MIPS32-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[ADD4]], [[UMULH1]] + ; MIPS32-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[COPY7]], [[UMULH1]] ; MIPS32-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD6]](s32), [[UMULH1]] + ; MIPS32-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY [[ADD6]](s32) ; MIPS32-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD5]], [[ICMP4]] - ; MIPS32-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH2]] + ; MIPS32-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[COPY8]], [[UMULH2]] ; MIPS32-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD8]](s32), [[UMULH2]] + ; MIPS32-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY [[ADD8]](s32) ; MIPS32-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[ICMP5]] - ; MIPS32-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD8]], [[ADD2]] + ; MIPS32-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[COPY9]], [[ADD2]] ; MIPS32-NEXT: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD10]](s32), [[ADD2]] + ; MIPS32-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY [[ADD10]](s32) ; MIPS32-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ICMP6]] ; MIPS32-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[LOAD3]], [[COPY]] ; MIPS32-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[LOAD2]], [[COPY1]] @@ -312,8 +319,8 @@ body: | ; MIPS32-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH5]] ; MIPS32-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[ADD11]] ; MIPS32-NEXT: $v0 = COPY [[MUL]](s32) - ; MIPS32-NEXT: $v1 = COPY [[ADD1]](s32) - ; MIPS32-NEXT: $a0 = COPY [[ADD10]](s32) + ; MIPS32-NEXT: $v1 = COPY [[COPY5]](s32) + ; MIPS32-NEXT: $a0 = COPY [[COPY10]](s32) ; MIPS32-NEXT: $a1 = COPY [[ADD18]](s32) ; MIPS32-NEXT: RetRA implicit $v0, implicit $v1, implicit $a0, implicit $a1 %2:_(s32) = COPY $a0 @@ -359,23 +366,28 @@ body: | ; MIPS32-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[COPY2]], [[COPY]] ; MIPS32-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL]], [[MUL1]] ; MIPS32-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[MUL1]] - ; MIPS32-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]] + ; MIPS32-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) + ; MIPS32-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY4]], [[UMULH]] ; MIPS32-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD1]](s32), [[UMULH]] + ; MIPS32-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32) ; MIPS32-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ICMP]], [[ICMP1]] ; MIPS32-NEXT: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[COPY3]], [[COPY1]] ; MIPS32-NEXT: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[COPY3]], [[COPY]] ; 
MIPS32-NEXT: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[COPY2]], [[COPY1]]
     ; MIPS32-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[MUL2]], [[UMULH1]]
     ; MIPS32-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD3]](s32), [[UMULH1]]
-    ; MIPS32-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[ADD3]], [[UMULH2]]
+    ; MIPS32-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[ADD3]](s32)
+    ; MIPS32-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[COPY6]], [[UMULH2]]
     ; MIPS32-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD4]](s32), [[UMULH2]]
+    ; MIPS32-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD4]](s32)
     ; MIPS32-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[ICMP2]], [[ICMP3]]
-    ; MIPS32-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[ADD4]], [[ADD2]]
+    ; MIPS32-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[COPY7]], [[ADD2]]
     ; MIPS32-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD6]](s32), [[ADD2]]
+    ; MIPS32-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY [[ADD6]](s32)
     ; MIPS32-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD5]], [[ICMP4]]
     ; MIPS32-NEXT: [[UMULH3:%[0-9]+]]:_(s32) = G_UMULH [[COPY3]], [[COPY1]]
     ; MIPS32-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[UMULH3]], [[ADD7]]
-    ; MIPS32-NEXT: $v0 = COPY [[ADD6]](s32)
+    ; MIPS32-NEXT: $v0 = COPY [[COPY8]](s32)
     ; MIPS32-NEXT: $v1 = COPY [[ADD8]](s32)
     ; MIPS32-NEXT: RetRA implicit $v0, implicit $v1
     %2:_(s32) = COPY $a0
diff --git a/llvm/test/CodeGen/NVPTX/common-linkage.ll b/llvm/test/CodeGen/NVPTX/common-linkage.ll
new file mode 100644
index 0000000000000..976074e12ba66
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/common-linkage.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=+ptx43 | FileCheck %s --check-prefixes CHECK,PTX43
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=+ptx50 | FileCheck %s --check-prefixes CHECK,PTX50
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx43 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx50 | %ptxas-verify %}
+
+; PTX43: .weak .global .align 4 .u32 g
+; PTX50: .common .global .align 4 .u32 g
+@g = common addrspace(1) global i32 0, align 4
+
+; CHECK: .weak .const .align 4 .u32 c
+@c = common addrspace(4) global i32 0, align 4
+
+; CHECK: .weak .shared .align 4 .u32 s
+@s = common addrspace(3) global i32 0, align 4
+
+define i32 @f1() {
+  %1 = load i32, ptr addrspace(1) @g
+  ret i32 %1
+}
+
+define i32 @f4() {
+  %1 = load i32, ptr addrspace(4) @c
+  ret i32 %1
+}
+
+define i32 @f3() {
+  %1 = load i32, ptr addrspace(3) @s
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/NVPTX/weak-global.ll b/llvm/test/CodeGen/NVPTX/weak-global.ll
index dd0160d1c0a65..c5467aad08a36 100644
--- a/llvm/test/CodeGen/NVPTX/weak-global.ll
+++ b/llvm/test/CodeGen/NVPTX/weak-global.ll
@@ -1,7 +1,10 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx43 | FileCheck %s --check-prefix PTX43
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx50 | FileCheck %s --check-prefix PTX50
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx43 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=+ptx50 | %ptxas-verify %}
 
-; CHECK: .weak .global .align 4 .u32 g
+; PTX43: .weak .global .align 4 .u32 g
+; PTX50: .common .global .align 4 .u32 g
 @g = common addrspace(1) global i32 zeroinitializer
 
 define i32 @func0() {
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv32.mir 
b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv32.mir index d169eb316dfcb..b3c62df4ffdca 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv32.mir @@ -89,10 +89,12 @@ body: | ; CHECK-NEXT: %yhi:_(s32) = COPY $x13 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD %xlo, %ylo ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), %ylo + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD %xhi, %yhi ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ICMP]] - ; CHECK-NEXT: $x10 = COPY [[ADD]](s32) - ; CHECK-NEXT: $x11 = COPY [[ADD2]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32) + ; CHECK-NEXT: $x10 = COPY [[COPY]](s32) + ; CHECK-NEXT: $x11 = COPY [[COPY1]](s32) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 %xlo:_(s32) = COPY $x10 %xhi:_(s32) = COPY $x11 @@ -121,10 +123,12 @@ body: | ; CHECK-NEXT: %hi2:_(s32) = COPY $x13 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD %lo1, %lo2 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), %lo2 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD %hi1, %hi2 ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ICMP]] - ; CHECK-NEXT: $x10 = COPY [[ADD]](s32) - ; CHECK-NEXT: $x11 = COPY [[ADD2]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32) + ; CHECK-NEXT: $x10 = COPY [[COPY]](s32) + ; CHECK-NEXT: $x11 = COPY [[COPY1]](s32) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 %lo1:_(s32) = COPY $x10 %hi1:_(s32) = COPY $x11 @@ -152,6 +156,7 @@ body: | ; CHECK-NEXT: %hi2:_(s32) = COPY $x15 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD %lo1, %lo2 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), %lo2 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD %mid1, %mid2 ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD1]](s32), %mid1 ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ICMP]] @@ -159,11 +164,13 @@ body: | ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[ADD2]](s32), [[C]] ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ICMP2]], [[ICMP]] ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ICMP1]], [[AND]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32) ; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD %hi1, %hi2 ; CHECK-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[ADD3]], [[OR]] - ; CHECK-NEXT: $x10 = COPY [[ADD]](s32) - ; CHECK-NEXT: $x11 = COPY [[ADD2]](s32) - ; CHECK-NEXT: $x12 = COPY [[ADD4]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ADD4]](s32) + ; CHECK-NEXT: $x10 = COPY [[COPY]](s32) + ; CHECK-NEXT: $x11 = COPY [[COPY1]](s32) + ; CHECK-NEXT: $x12 = COPY [[COPY2]](s32) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11, implicit $x12 %lo1:_(s32) = COPY $x10 %mid1:_(s32) = COPY $x11 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv64.mir index f394e4d5064ed..6e76bb0e3eff5 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-add-rv64.mir @@ -121,10 +121,12 @@ body: | ; CHECK-NEXT: %y01:_(s64) = COPY $x13 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD %x00, %y00 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[ADD]](s64), %y00 + 
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY [[ADD]](s64) ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD %x01, %y01 ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s64) = G_ADD [[ADD1]], [[ICMP]] - ; CHECK-NEXT: $x10 = COPY [[ADD]](s64) - ; CHECK-NEXT: $x11 = COPY [[ADD2]](s64) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY [[ADD2]](s64) + ; CHECK-NEXT: $x10 = COPY [[COPY]](s64) + ; CHECK-NEXT: $x11 = COPY [[COPY1]](s64) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 %x00:_(s64) = COPY $x10 %x01:_(s64) = COPY $x11 @@ -153,10 +155,12 @@ body: | ; CHECK-NEXT: %hi2:_(s64) = COPY $x13 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD %lo1, %lo2 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[ADD]](s64), %lo2 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY [[ADD]](s64) ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD %hi1, %hi2 ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s64) = G_ADD [[ADD1]], [[ICMP]] - ; CHECK-NEXT: $x10 = COPY [[ADD]](s64) - ; CHECK-NEXT: $x11 = COPY [[ADD2]](s64) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY [[ADD2]](s64) + ; CHECK-NEXT: $x10 = COPY [[COPY]](s64) + ; CHECK-NEXT: $x11 = COPY [[COPY1]](s64) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 %lo1:_(s64) = COPY $x10 %hi1:_(s64) = COPY $x11 @@ -184,6 +188,7 @@ body: | ; CHECK-NEXT: %hi2:_(s64) = COPY $x15 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD %lo1, %lo2 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[ADD]](s64), %lo2 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY [[ADD]](s64) ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD %mid1, %mid2 ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[ADD1]](s64), %mid1 ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s64) = G_ADD [[ADD1]], [[ICMP]] @@ -194,14 +199,16 @@ body: | ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[TRUNC1]] ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[TRUNC2]], [[AND]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY [[ADD2]](s64) ; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s64) = G_ADD %hi1, %hi2 ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C1]] ; CHECK-NEXT: [[ADD4:%[0-9]+]]:_(s64) = G_ADD [[ADD3]], [[AND1]] - ; CHECK-NEXT: $x10 = COPY [[ADD]](s64) - ; CHECK-NEXT: $x11 = COPY [[ADD2]](s64) - ; CHECK-NEXT: $x12 = COPY [[ADD4]](s64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[ADD4]](s64) + ; CHECK-NEXT: $x10 = COPY [[COPY]](s64) + ; CHECK-NEXT: $x11 = COPY [[COPY1]](s64) + ; CHECK-NEXT: $x12 = COPY [[COPY2]](s64) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11, implicit $x12 %lo1:_(s64) = COPY $x10 %mid1:_(s64) = COPY $x11 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv32.mir index c348ec6f73ad3..9227e6530221c 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv32.mir @@ -92,7 +92,8 @@ body: | ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[ADD]](s32), [[COPY]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY1]](s32), [[C]] ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ICMP1]], [[ICMP]] - ; CHECK-NEXT: $x10 = COPY [[ADD]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) + ; CHECK-NEXT: $x10 = COPY [[COPY2]](s32) ; CHECK-NEXT: $x11 = COPY [[XOR]](s32) ; CHECK-NEXT: PseudoRET 
implicit $x10, implicit $x11 %0:_(s32) = COPY $x10 @@ -119,21 +120,23 @@ body: | ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY2]] ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[COPY2]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY3]] ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ICMP]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[ADD2]](s32), [[COPY1]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[ADD2]](s32), [[COPY1]] - ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[COPY]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY5]](s32), [[COPY1]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY5]](s32), [[COPY1]] + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY4]](s32), [[COPY]] ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP3]], [[ICMP1]] ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY3]](s32), [[C1]] ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY3]](s32), [[C1]] ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY2]](s32), [[C]] ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s32), [[ICMP6]], [[ICMP4]] ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SELECT1]], [[SELECT]] - ; CHECK-NEXT: $x10 = COPY [[ADD]](s32) - ; CHECK-NEXT: $x11 = COPY [[ADD2]](s32) + ; CHECK-NEXT: $x10 = COPY [[COPY4]](s32) + ; CHECK-NEXT: $x11 = COPY [[COPY5]](s32) ; CHECK-NEXT: $x12 = COPY [[XOR]](s32) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11, implicit $x12 %2:_(s32) = COPY $x10 @@ -241,7 +244,8 @@ body: | ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[SUB]](s32), [[COPY]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY1]](s32), [[C]] ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ICMP1]], [[ICMP]] - ; CHECK-NEXT: $x10 = COPY [[SUB]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB]](s32) + ; CHECK-NEXT: $x10 = COPY [[COPY2]](s32) ; CHECK-NEXT: $x11 = COPY [[XOR]](s32) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 %0:_(s32) = COPY $x10 @@ -377,7 +381,8 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]] ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[COPY1]] - ; CHECK-NEXT: $x10 = COPY [[ADD]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) + ; CHECK-NEXT: $x10 = COPY [[COPY2]](s32) ; CHECK-NEXT: $x11 = COPY [[ICMP]](s32) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 %0:_(s32) = COPY $x10 @@ -404,14 +409,16 @@ body: | ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY2]] ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[COPY2]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY3]] ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ICMP]] - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD2]](s32), [[COPY3]] - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP 
intpred(eq), [[ADD2]](s32), [[COPY3]] - ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[COPY2]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32) + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY5]](s32), [[COPY3]] + ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY5]](s32), [[COPY3]] + ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY4]](s32), [[COPY2]] ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP3]], [[ICMP1]] - ; CHECK-NEXT: $x10 = COPY [[ADD]](s32) - ; CHECK-NEXT: $x11 = COPY [[ADD2]](s32) + ; CHECK-NEXT: $x10 = COPY [[COPY4]](s32) + ; CHECK-NEXT: $x11 = COPY [[COPY5]](s32) ; CHECK-NEXT: $x12 = COPY [[SELECT]](s32) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11, implicit $x12 %2:_(s32) = COPY $x10 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir index 5506f5228e9db..8acaff5dbb25a 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-addo-subo-rv64.mir @@ -125,8 +125,9 @@ body: | ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[ADD]](s64) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[XOR]](s32) - ; CHECK-NEXT: $x10 = COPY [[ADD]](s64) + ; CHECK-NEXT: $x10 = COPY [[COPY2]](s64) ; CHECK-NEXT: $x11 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 %0:_(s64) = COPY $x10 @@ -261,8 +262,9 @@ body: | ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[SUB]](s64) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[XOR]](s32) - ; CHECK-NEXT: $x10 = COPY [[SUB]](s64) + ; CHECK-NEXT: $x10 = COPY [[COPY2]](s64) ; CHECK-NEXT: $x11 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 %0:_(s64) = COPY $x10 @@ -364,7 +366,8 @@ body: | ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[ZEXT]](s64), [[AND]] - ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[ADD]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY2]](s32) ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: $x11 = COPY [[ICMP]](s64) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 @@ -393,7 +396,8 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY1]] ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[ADD]](s64), [[COPY1]] - ; CHECK-NEXT: $x10 = COPY [[ADD]](s64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[ADD]](s64) + ; CHECK-NEXT: $x10 = COPY [[COPY2]](s64) ; CHECK-NEXT: $x11 = COPY [[ICMP]](s64) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 %0:_(s64) = COPY $x10 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv32.mir index a890a411544e7..354fc109a4638 100644 
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv32.mir @@ -50,8 +50,8 @@ body: | ; RV32I-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 ; RV32I-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C13]] ; RV32I-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND8]], [[C14]] ; RV32I-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND8]], [[C14]] ; RV32I-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C15]](s32) ; RV32I-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; RV32I-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C16]], [[LSHR6]] @@ -129,8 +129,8 @@ body: | ; RV32I-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 3855 ; RV32I-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C15]] ; RV32I-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 257 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND9]], [[C16]] ; RV32I-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND9]], [[C16]] ; RV32I-NEXT: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; RV32I-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[MUL]], [[C18]] ; RV32I-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[AND10]], [[C17]](s32) @@ -201,8 +201,8 @@ body: | ; RV32I-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135 ; RV32I-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C10]] ; RV32I-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C11]] ; RV32I-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C11]] ; RV32I-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C12]](s32) ; RV32I-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 ; RV32I-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C13]], [[LSHR8]] @@ -267,8 +267,8 @@ body: | ; RV32I-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135 ; RV32I-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C11]] ; RV32I-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C12]] ; RV32I-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C12]] ; RV32I-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C13]](s32) ; RV32I-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 ; RV32I-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C14]], [[LSHR8]] @@ -306,8 +306,8 @@ body: | ; RV32I-NEXT: [[C26:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135 ; RV32I-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ADD4]], [[C26]] ; RV32I-NEXT: [[C27:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009 - ; RV32I-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[AND7]], [[C27]] ; RV32I-NEXT: [[C28:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; RV32I-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[AND7]], [[C27]] ; RV32I-NEXT: [[LSHR17:%[0-9]+]]:_(s32) = G_LSHR [[MUL1]], [[C28]](s32) ; RV32I-NEXT: [[C29:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 ; RV32I-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C29]], [[LSHR17]] @@ -389,8 +389,8 @@ body: | ; RV32I-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 ; RV32I-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C13]] ; RV32I-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND8]], [[C14]] ; RV32I-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND8]], [[C14]] ; RV32I-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = 
G_LSHR [[MUL]], [[C15]](s32) ; RV32I-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; RV32I-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C16]], [[LSHR6]] @@ -468,8 +468,8 @@ body: | ; RV32I-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 3855 ; RV32I-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C15]] ; RV32I-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 257 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND9]], [[C16]] ; RV32I-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND9]], [[C16]] ; RV32I-NEXT: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; RV32I-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[MUL]], [[C18]] ; RV32I-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[AND10]], [[C17]](s32) @@ -540,8 +540,8 @@ body: | ; RV32I-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135 ; RV32I-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C10]] ; RV32I-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C11]] ; RV32I-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C11]] ; RV32I-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C12]](s32) ; RV32I-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 ; RV32I-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C13]], [[LSHR8]] @@ -606,8 +606,8 @@ body: | ; RV32I-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135 ; RV32I-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C11]] ; RV32I-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C12]] ; RV32I-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C12]] ; RV32I-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C13]](s32) ; RV32I-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 ; RV32I-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C14]], [[LSHR8]] @@ -645,8 +645,8 @@ body: | ; RV32I-NEXT: [[C26:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135 ; RV32I-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ADD4]], [[C26]] ; RV32I-NEXT: [[C27:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009 - ; RV32I-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[AND7]], [[C27]] ; RV32I-NEXT: [[C28:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; RV32I-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[AND7]], [[C27]] ; RV32I-NEXT: [[LSHR17:%[0-9]+]]:_(s32) = G_LSHR [[MUL1]], [[C28]](s32) ; RV32I-NEXT: [[C29:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 ; RV32I-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C29]], [[LSHR17]] diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir index add8a565202df..38a4b9c6dae38 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctlz-rv64.mir @@ -283,8 +283,8 @@ body: | ; RV64I-NEXT: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 1085102592571150095 ; RV64I-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[ADD1]], [[C11]] ; RV64I-NEXT: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 72340172838076673 - ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND3]], [[C12]] ; RV64I-NEXT: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 56 + ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND3]], [[C12]] ; RV64I-NEXT: [[LSHR9:%[0-9]+]]:_(s64) = G_LSHR [[MUL]], [[C13]](s64) ; RV64I-NEXT: [[C14:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 ; RV64I-NEXT: [[SUB1:%[0-9]+]]:_(s64) = G_SUB [[C14]], [[LSHR9]] @@ -583,8 +583,8 @@ body: | ; RV64I-NEXT: 
[[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 1085102592571150095 ; RV64I-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[ADD1]], [[C11]] ; RV64I-NEXT: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 72340172838076673 - ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND3]], [[C12]] ; RV64I-NEXT: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 56 + ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND3]], [[C12]] ; RV64I-NEXT: [[LSHR9:%[0-9]+]]:_(s64) = G_LSHR [[MUL]], [[C13]](s64) ; RV64I-NEXT: [[C14:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 ; RV64I-NEXT: [[SUB1:%[0-9]+]]:_(s64) = G_SUB [[C14]], [[LSHR9]] diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv32.mir index d4eb5ebc2e294..c64669cb7341e 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv32.mir @@ -35,8 +35,8 @@ body: | ; RV32I-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 ; RV32I-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C7]] ; RV32I-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND5]], [[C8]] ; RV32I-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND5]], [[C8]] ; RV32I-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C9]](s32) ; RV32I-NEXT: $x10 = COPY [[LSHR3]](s32) ; RV32I-NEXT: PseudoRET implicit $x10 @@ -90,8 +90,8 @@ body: | ; RV32I-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 3855 ; RV32I-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C7]] ; RV32I-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 257 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND5]], [[C8]] ; RV32I-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND5]], [[C8]] ; RV32I-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; RV32I-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[MUL]], [[C10]] ; RV32I-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[C9]](s32) @@ -143,8 +143,8 @@ body: | ; RV32I-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135 ; RV32I-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C5]] ; RV32I-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C6]] ; RV32I-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C6]] ; RV32I-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C7]](s32) ; RV32I-NEXT: $x10 = COPY [[LSHR3]](s32) ; RV32I-NEXT: PseudoRET implicit $x10 @@ -190,8 +190,8 @@ body: | ; RV32I-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135 ; RV32I-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C5]] ; RV32I-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C6]] ; RV32I-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND3]], [[C6]] ; RV32I-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C7]](s32) ; RV32I-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; RV32I-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[COPY1]], [[C8]](s32) @@ -210,8 +210,8 @@ body: | ; RV32I-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135 ; RV32I-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ADD3]], [[C13]] ; RV32I-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009 - ; RV32I-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[AND7]], [[C14]] ; RV32I-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; RV32I-NEXT: 
[[MUL1:%[0-9]+]]:_(s32) = G_MUL [[AND7]], [[C14]] ; RV32I-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[MUL1]], [[C15]](s32) ; RV32I-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[LSHR7]], [[LSHR3]] ; RV32I-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv64.mir index e2434ba9301c0..196b367e59271 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-ctpop-rv64.mir @@ -205,8 +205,8 @@ body: | ; RV64I-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 1085102592571150095 ; RV64I-NEXT: [[AND3:%[0-9]+]]:_(s64) = G_AND [[ADD1]], [[C5]] ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 72340172838076673 - ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND3]], [[C6]] ; RV64I-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 56 + ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND3]], [[C6]] ; RV64I-NEXT: [[LSHR3:%[0-9]+]]:_(s64) = G_LSHR [[MUL]], [[C7]](s64) ; RV64I-NEXT: $x10 = COPY [[LSHR3]](s64) ; RV64I-NEXT: PseudoRET implicit $x10 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv32.mir index 19555a702b73c..372becaf08d94 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv32.mir @@ -39,8 +39,8 @@ body: | ; RV32I-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 ; RV32I-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C8]] ; RV32I-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND6]], [[C9]] ; RV32I-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND6]], [[C9]] ; RV32I-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C10]](s32) ; RV32I-NEXT: $x10 = COPY [[LSHR3]](s32) ; RV32I-NEXT: PseudoRET implicit $x10 @@ -98,8 +98,8 @@ body: | ; RV32I-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 3855 ; RV32I-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C8]] ; RV32I-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 257 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND6]], [[C9]] ; RV32I-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND6]], [[C9]] ; RV32I-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; RV32I-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[MUL]], [[C11]] ; RV32I-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[C10]](s32) @@ -155,8 +155,8 @@ body: | ; RV32I-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135 ; RV32I-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C6]] ; RV32I-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND4]], [[C7]] ; RV32I-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND4]], [[C7]] ; RV32I-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C8]](s32) ; RV32I-NEXT: $x10 = COPY [[LSHR3]](s32) ; RV32I-NEXT: PseudoRET implicit $x10 @@ -208,8 +208,8 @@ body: | ; RV32I-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135 ; RV32I-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C7]] ; RV32I-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND4]], [[C8]] ; RV32I-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND4]], 
[[C8]] ; RV32I-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C9]](s32) ; RV32I-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 ; RV32I-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[LSHR3]], [[C10]] @@ -234,8 +234,8 @@ body: | ; RV32I-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135 ; RV32I-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[ADD6]], [[C17]] ; RV32I-NEXT: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009 - ; RV32I-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[AND9]], [[C18]] ; RV32I-NEXT: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; RV32I-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[AND9]], [[C18]] ; RV32I-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[MUL1]], [[C19]](s32) ; RV32I-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[ADD3]], [[LSHR7]] ; RV32I-NEXT: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 @@ -304,8 +304,8 @@ body: | ; RV32I-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 ; RV32I-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C8]] ; RV32I-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND6]], [[C9]] ; RV32I-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND6]], [[C9]] ; RV32I-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C10]](s32) ; RV32I-NEXT: $x10 = COPY [[LSHR3]](s32) ; RV32I-NEXT: PseudoRET implicit $x10 @@ -363,8 +363,8 @@ body: | ; RV32I-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 3855 ; RV32I-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C8]] ; RV32I-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 257 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND6]], [[C9]] ; RV32I-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND6]], [[C9]] ; RV32I-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; RV32I-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[MUL]], [[C11]] ; RV32I-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[C10]](s32) @@ -420,8 +420,8 @@ body: | ; RV32I-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135 ; RV32I-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C6]] ; RV32I-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND4]], [[C7]] ; RV32I-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND4]], [[C7]] ; RV32I-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C8]](s32) ; RV32I-NEXT: $x10 = COPY [[LSHR3]](s32) ; RV32I-NEXT: PseudoRET implicit $x10 @@ -473,8 +473,8 @@ body: | ; RV32I-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135 ; RV32I-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C7]] ; RV32I-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009 - ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND4]], [[C8]] ; RV32I-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; RV32I-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[AND4]], [[C8]] ; RV32I-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[MUL]], [[C9]](s32) ; RV32I-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 ; RV32I-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[LSHR3]], [[C10]] @@ -499,8 +499,8 @@ body: | ; RV32I-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 252645135 ; RV32I-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[ADD6]], [[C17]] ; RV32I-NEXT: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 16843009 - ; RV32I-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[AND9]], [[C18]] ; RV32I-NEXT: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; RV32I-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[AND9]], [[C18]] ; RV32I-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[MUL1]], 
[[C19]](s32) ; RV32I-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[ADD3]], [[LSHR7]] ; RV32I-NEXT: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir index e030e3ce2a803..e51a2143efd02 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-cttz-rv64.mir @@ -221,8 +221,8 @@ body: | ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1085102592571150095 ; RV64I-NEXT: [[AND4:%[0-9]+]]:_(s64) = G_AND [[ADD2]], [[C6]] ; RV64I-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 72340172838076673 - ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND4]], [[C7]] ; RV64I-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 56 + ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND4]], [[C7]] ; RV64I-NEXT: [[LSHR3:%[0-9]+]]:_(s64) = G_LSHR [[MUL]], [[C8]](s64) ; RV64I-NEXT: $x10 = COPY [[LSHR3]](s64) ; RV64I-NEXT: PseudoRET implicit $x10 @@ -457,8 +457,8 @@ body: | ; RV64I-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1085102592571150095 ; RV64I-NEXT: [[AND4:%[0-9]+]]:_(s64) = G_AND [[ADD2]], [[C6]] ; RV64I-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 72340172838076673 - ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND4]], [[C7]] ; RV64I-NEXT: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 56 + ; RV64I-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[AND4]], [[C7]] ; RV64I-NEXT: [[LSHR3:%[0-9]+]]:_(s64) = G_LSHR [[MUL]], [[C8]](s64) ; RV64I-NEXT: $x10 = COPY [[LSHR3]](s64) ; RV64I-NEXT: PseudoRET implicit $x10 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-mul-ext-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-mul-ext-rv32.mir index 433d6e6b821f3..ec2dc568a5ec3 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-mul-ext-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-mul-ext-rv32.mir @@ -162,8 +162,10 @@ body: | ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH %lo1, %lo2 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL1]], [[MUL2]] ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[MUL2]] - ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]] + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[UMULH]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD1]](s32), [[UMULH]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[ADD1]](s32) ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ICMP]], [[ICMP1]] ; CHECK-NEXT: [[MUL3:%[0-9]+]]:_(s32) = G_MUL %hi1, %lo2 ; CHECK-NEXT: [[MUL4:%[0-9]+]]:_(s32) = G_MUL %mid1, %mid2 @@ -171,13 +173,18 @@ body: | ; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH %mid1, %lo2 ; CHECK-NEXT: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH %lo1, %mid2 ; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[MUL3]], [[MUL4]] - ; CHECK-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[ADD3]], [[MUL5]] - ; CHECK-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[ADD4]], [[UMULH1]] - ; CHECK-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[ADD5]], [[UMULH2]] - ; CHECK-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[ADD2]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ADD3]](s32) + ; CHECK-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[MUL5]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[ADD4]](s32) + ; CHECK-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[COPY3]], [[UMULH1]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY 
[[ADD5]](s32) + ; CHECK-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[COPY4]], [[UMULH2]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD6]](s32) + ; CHECK-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[COPY5]], [[ADD2]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[ADD7]](s32) ; CHECK-NEXT: $x10 = COPY [[MUL]](s32) - ; CHECK-NEXT: $x11 = COPY [[ADD1]](s32) - ; CHECK-NEXT: $x12 = COPY [[ADD7]](s32) + ; CHECK-NEXT: $x11 = COPY [[COPY1]](s32) + ; CHECK-NEXT: $x12 = COPY [[COPY6]](s32) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11, implicit $x12 %lo1:_(s32) = COPY $x10 %mid1:_(s32) = COPY $x11 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-mul-ext-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-mul-ext-rv64.mir index 09e002e8428d7..39d9c5b7dfd1e 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-mul-ext-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-mul-ext-rv64.mir @@ -194,8 +194,10 @@ body: | ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s64) = G_UMULH %lo1, %lo2 ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[MUL1]], [[MUL2]] ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[ADD]](s64), [[MUL2]] - ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[ADD]], [[UMULH]] + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY [[ADD]](s64) + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[UMULH]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[ADD1]](s64), [[UMULH]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY [[ADD1]](s64) ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s64) = G_ADD [[ICMP]], [[ICMP1]] ; CHECK-NEXT: [[MUL3:%[0-9]+]]:_(s64) = G_MUL %hi1, %lo2 ; CHECK-NEXT: [[MUL4:%[0-9]+]]:_(s64) = G_MUL %mid1, %mid2 @@ -203,13 +205,18 @@ body: | ; CHECK-NEXT: [[UMULH1:%[0-9]+]]:_(s64) = G_UMULH %mid1, %lo2 ; CHECK-NEXT: [[UMULH2:%[0-9]+]]:_(s64) = G_UMULH %lo1, %mid2 ; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s64) = G_ADD [[MUL3]], [[MUL4]] - ; CHECK-NEXT: [[ADD4:%[0-9]+]]:_(s64) = G_ADD [[ADD3]], [[MUL5]] - ; CHECK-NEXT: [[ADD5:%[0-9]+]]:_(s64) = G_ADD [[ADD4]], [[UMULH1]] - ; CHECK-NEXT: [[ADD6:%[0-9]+]]:_(s64) = G_ADD [[ADD5]], [[UMULH2]] - ; CHECK-NEXT: [[ADD7:%[0-9]+]]:_(s64) = G_ADD [[ADD6]], [[ADD2]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[ADD3]](s64) + ; CHECK-NEXT: [[ADD4:%[0-9]+]]:_(s64) = G_ADD [[COPY2]], [[MUL5]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[ADD4]](s64) + ; CHECK-NEXT: [[ADD5:%[0-9]+]]:_(s64) = G_ADD [[COPY3]], [[UMULH1]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY [[ADD5]](s64) + ; CHECK-NEXT: [[ADD6:%[0-9]+]]:_(s64) = G_ADD [[COPY4]], [[UMULH2]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s64) = COPY [[ADD6]](s64) + ; CHECK-NEXT: [[ADD7:%[0-9]+]]:_(s64) = G_ADD [[COPY5]], [[ADD2]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s64) = COPY [[ADD7]](s64) ; CHECK-NEXT: $x10 = COPY [[MUL]](s64) - ; CHECK-NEXT: $x11 = COPY [[ADD1]](s64) - ; CHECK-NEXT: $x12 = COPY [[ADD7]](s64) + ; CHECK-NEXT: $x11 = COPY [[COPY1]](s64) + ; CHECK-NEXT: $x12 = COPY [[COPY6]](s64) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11, implicit $x12 %lo1:_(s64) = COPY $x10 %mid1:_(s64) = COPY $x11 diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll index 455e6e54c9b39..549d531e829ea 100644 --- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll @@ -1160,8 +1160,6 @@ define i32 @test_ctlz_i32(i32 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: beqz a0, .LBB10_2 ; RV32I-NEXT: # %bb.1: # %cond.false -; RV32I-NEXT: addi sp, sp, -16 -; 
RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: srli a1, a0, 1 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 2 @@ -1189,12 +1187,11 @@ define i32 @test_ctlz_i32(i32 %a) nounwind { ; RV32I-NEXT: lui a1, 61681 ; RV32I-NEXT: addi a1, a1, -241 ; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi a1, a1, 257 -; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB10_2: ; RV32I-NEXT: li a0, 32 @@ -1205,8 +1202,6 @@ define i32 @test_ctlz_i32(i32 %a) nounwind { ; RV64I-NEXT: sext.w a1, a0 ; RV64I-NEXT: beqz a1, .LBB10_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srliw a1, a0, 1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 @@ -1232,14 +1227,13 @@ define i32 @test_ctlz_i32(i32 %a) nounwind { ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: addi a1, a1, -241 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srliw a0, a0, 24 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB10_2: ; RV64I-NEXT: li a0, 32 @@ -1354,19 +1348,16 @@ define i32 @test_ctlz_i32(i32 %a) nounwind { define i64 @test_ctlz_i64(i64 %a) nounwind { ; RV32I-LABEL: test_ctlz_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a1 -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: srli a0, a1, 1 -; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: addi a4, a2, 1365 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a3, a2, 819 +; RV32I-NEXT: lui a2, 61681 +; RV32I-NEXT: addi a2, a2, -241 +; RV32I-NEXT: bnez a1, .LBB11_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 2 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 4 @@ -1377,28 +1368,26 @@ define i64 @test_ctlz_i64(i64 %a) nounwind { ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: not a0, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi s4, a2, 1365 -; RV32I-NEXT: and a1, a1, s4 +; RV32I-NEXT: and a1, a1, a4 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi s5, a1, 819 -; RV32I-NEXT: and a1, a0, s5 +; RV32I-NEXT: and a1, a0, a3 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s5 +; RV32I-NEXT: and a0, a0, a3 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi s6, a1, -241 -; RV32I-NEXT: and a0, a0, s6 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi s3, a1, 257 -; RV32I-NEXT: mv a1, s3 -; RV32I-NEXT: call 
__mulsi3 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: srli a0, s2, 1 -; RV32I-NEXT: or a0, s2, a0 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: addi a0, a0, 32 +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB11_2: +; RV32I-NEXT: srli a0, a1, 1 +; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 2 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 4 @@ -1409,43 +1398,27 @@ define i64 @test_ctlz_i64(i64 %a) nounwind { ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: not a0, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, s4 +; RV32I-NEXT: and a1, a1, a4 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, s5 +; RV32I-NEXT: and a1, a0, a3 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s5 +; RV32I-NEXT: and a0, a0, a3 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, s6 -; RV32I-NEXT: mv a1, s3 -; RV32I-NEXT: call __mulsi3 -; RV32I-NEXT: bnez s0, .LBB11_2 -; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: addi a0, a0, 32 -; RV32I-NEXT: j .LBB11_3 -; RV32I-NEXT: .LBB11_2: -; RV32I-NEXT: srli a0, s1, 24 -; RV32I-NEXT: .LBB11_3: ; RV32I-NEXT: li a1, 0 -; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 0(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; ; RV64I-LABEL: test_ctlz_i64: ; RV64I: # %bb.0: ; RV64I-NEXT: beqz a0, .LBB11_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 2 @@ -1481,14 +1454,13 @@ define i64 @test_ctlz_i64(i64 %a) nounwind { ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srli a0, a0, 56 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB11_2: ; RV64I-NEXT: li a0, 64 @@ -1831,8 +1803,6 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind { define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { ; RV32I-LABEL: test_ctlz_i32_zero_undef: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: srli a1, a0, 1 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 2 @@ -1860,18 +1830,15 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { ; RV32I-NEXT: lui a1, 61681 ; RV32I-NEXT: addi a1, a1, -241 ; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi a1, a1, 257 -; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: slli a1, a0, 8 +; 
RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV64I-LABEL: test_ctlz_i32_zero_undef: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srliw a1, a0, 1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 @@ -1897,14 +1864,13 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: addi a1, a1, -241 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srliw a0, a0, 24 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV32M-LABEL: test_ctlz_i32_zero_undef: @@ -2005,19 +1971,16 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind { define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV32I-LABEL: test_ctlz_i64_zero_undef: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a1 -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: srli a0, a1, 1 -; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: addi a4, a2, 1365 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a3, a2, 819 +; RV32I-NEXT: lui a2, 61681 +; RV32I-NEXT: addi a2, a2, -241 +; RV32I-NEXT: bnez a1, .LBB15_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 2 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 4 @@ -2028,28 +1991,26 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: not a0, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi s4, a2, 1365 -; RV32I-NEXT: and a1, a1, s4 +; RV32I-NEXT: and a1, a1, a4 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi s5, a1, 819 -; RV32I-NEXT: and a1, a0, s5 +; RV32I-NEXT: and a1, a0, a3 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s5 +; RV32I-NEXT: and a0, a0, a3 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi s6, a1, -241 -; RV32I-NEXT: and a0, a0, s6 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi s3, a1, 257 -; RV32I-NEXT: mv a1, s3 -; RV32I-NEXT: call __mulsi3 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: srli a0, s2, 1 -; RV32I-NEXT: or a0, s2, a0 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: addi a0, a0, 32 +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB15_2: +; RV32I-NEXT: srli a0, a1, 1 +; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 2 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 4 
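;; The slli/add pairs that replace the __mulsi3/__muldi3 calls in these checks
;; compute the SWAR popcount's final multiply by 0x01010101 (the removed
;; lui 4112 + addi 257 sequence) without the M extension, using the
;; factorization 0x01010101 = 0x101 * 0x10001. A minimal standalone sketch of
;; the identity in LLVM IR (illustrative only; the function name is made up
;; and is not part of the generated checks):

define i32 @mul_0x01010101(i32 %x) {
  %s8 = shl i32 %x, 8
  %a8 = add i32 %x, %s8      ; %x * 0x101
  %s16 = shl i32 %a8, 16
  %a16 = add i32 %a8, %s16   ; %x * 0x01010101 (mod 2^32)
  ret i32 %a16
}

;; The srli by 24 that follows in the checks then extracts the accumulated
;; byte-count sum from the top byte; the i64 variants append one more
;; shift-by-32/add step to form the 0x0101010101010101 multiplier before
;; shifting right by 56.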
@@ -2060,41 +2021,25 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: not a0, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, s4 +; RV32I-NEXT: and a1, a1, a4 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, s5 +; RV32I-NEXT: and a1, a0, a3 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s5 +; RV32I-NEXT: and a0, a0, a3 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, s6 -; RV32I-NEXT: mv a1, s3 -; RV32I-NEXT: call __mulsi3 -; RV32I-NEXT: bnez s0, .LBB15_2 -; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: addi a0, a0, 32 -; RV32I-NEXT: j .LBB15_3 -; RV32I-NEXT: .LBB15_2: -; RV32I-NEXT: srli a0, s1, 24 -; RV32I-NEXT: .LBB15_3: ; RV32I-NEXT: li a1, 0 -; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 0(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; ; RV64I-LABEL: test_ctlz_i64_zero_undef: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 2 @@ -2130,14 +2075,13 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind { ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srli a0, a0, 56 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV32M-LABEL: test_ctlz_i64_zero_undef: @@ -2464,8 +2408,6 @@ define i16 @test_ctpop_i16(i16 %a) nounwind { define i32 @test_ctpop_i32(i32 %a) nounwind { ; RV32I-LABEL: test_ctpop_i32: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: srli a1, a0, 1 ; RV32I-NEXT: lui a2, 349525 ; RV32I-NEXT: addi a2, a2, 1365 @@ -2482,18 +2424,15 @@ define i32 @test_ctpop_i32(i32 %a) nounwind { ; RV32I-NEXT: lui a1, 61681 ; RV32I-NEXT: addi a1, a1, -241 ; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi a1, a1, 257 -; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV64I-LABEL: test_ctpop_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 @@ -2508,14 +2447,13 @@ define i32 @test_ctpop_i32(i32 %a) nounwind { ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 -; 
RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: addi a1, a1, -241 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srliw a0, a0, 24 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV32M-LABEL: test_ctpop_i32: @@ -2578,8 +2516,6 @@ define i32 @test_ctpop_i32(i32 %a) nounwind { ; ; RV32XTHEADBB-LABEL: test_ctpop_i32: ; RV32XTHEADBB: # %bb.0: -; RV32XTHEADBB-NEXT: addi sp, sp, -16 -; RV32XTHEADBB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32XTHEADBB-NEXT: srli a1, a0, 1 ; RV32XTHEADBB-NEXT: lui a2, 349525 ; RV32XTHEADBB-NEXT: addi a2, a2, 1365 @@ -2596,18 +2532,15 @@ define i32 @test_ctpop_i32(i32 %a) nounwind { ; RV32XTHEADBB-NEXT: lui a1, 61681 ; RV32XTHEADBB-NEXT: addi a1, a1, -241 ; RV32XTHEADBB-NEXT: and a0, a0, a1 -; RV32XTHEADBB-NEXT: lui a1, 4112 -; RV32XTHEADBB-NEXT: addi a1, a1, 257 -; RV32XTHEADBB-NEXT: call __mulsi3 +; RV32XTHEADBB-NEXT: slli a1, a0, 8 +; RV32XTHEADBB-NEXT: add a0, a0, a1 +; RV32XTHEADBB-NEXT: slli a1, a0, 16 +; RV32XTHEADBB-NEXT: add a0, a0, a1 ; RV32XTHEADBB-NEXT: srli a0, a0, 24 -; RV32XTHEADBB-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32XTHEADBB-NEXT: addi sp, sp, 16 ; RV32XTHEADBB-NEXT: ret ; ; RV64XTHEADBB-LABEL: test_ctpop_i32: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: addi sp, sp, -16 -; RV64XTHEADBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64XTHEADBB-NEXT: srli a1, a0, 1 ; RV64XTHEADBB-NEXT: lui a2, 349525 ; RV64XTHEADBB-NEXT: addiw a2, a2, 1365 @@ -2622,14 +2555,13 @@ define i32 @test_ctpop_i32(i32 %a) nounwind { ; RV64XTHEADBB-NEXT: srli a1, a0, 4 ; RV64XTHEADBB-NEXT: add a0, a0, a1 ; RV64XTHEADBB-NEXT: lui a1, 61681 -; RV64XTHEADBB-NEXT: addiw a1, a1, -241 +; RV64XTHEADBB-NEXT: addi a1, a1, -241 ; RV64XTHEADBB-NEXT: and a0, a0, a1 -; RV64XTHEADBB-NEXT: lui a1, 4112 -; RV64XTHEADBB-NEXT: addiw a1, a1, 257 -; RV64XTHEADBB-NEXT: call __muldi3 +; RV64XTHEADBB-NEXT: slli a1, a0, 8 +; RV64XTHEADBB-NEXT: add a0, a0, a1 +; RV64XTHEADBB-NEXT: slli a1, a0, 16 +; RV64XTHEADBB-NEXT: add a0, a0, a1 ; RV64XTHEADBB-NEXT: srliw a0, a0, 24 -; RV64XTHEADBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64XTHEADBB-NEXT: addi sp, sp, 16 ; RV64XTHEADBB-NEXT: ret %1 = call i32 @llvm.ctpop.i32(i32 %a) ret i32 %1 @@ -2638,65 +2570,48 @@ define i32 @test_ctpop_i32(i32 %a) nounwind { define i64 @test_ctpop_i64(i64 %a) nounwind { ; RV32I-LABEL: test_ctpop_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: srli a0, a1, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi s2, a2, 1365 -; RV32I-NEXT: and a0, a0, s2 -; RV32I-NEXT: sub a1, a1, a0 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi s3, a0, 819 -; RV32I-NEXT: and a0, a1, s3 +; RV32I-NEXT: srli a2, a1, 1 +; RV32I-NEXT: lui a3, 349525 +; RV32I-NEXT: addi a3, a3, 1365 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a2, a2, 819 +; RV32I-NEXT: and a4, a1, a2 ; RV32I-NEXT: srli a1, a1, 2 -; RV32I-NEXT: 
and a1, a1, s3 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi s4, a1, -241 -; RV32I-NEXT: and a0, a0, s4 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi s1, a1, 257 -; RV32I-NEXT: mv a1, s1 -; RV32I-NEXT: call __mulsi3 -; RV32I-NEXT: srli s5, a0, 24 -; RV32I-NEXT: srli a0, s0, 1 -; RV32I-NEXT: and a0, a0, s2 -; RV32I-NEXT: sub s0, s0, a0 -; RV32I-NEXT: and a0, s0, s3 -; RV32I-NEXT: srli s0, s0, 2 -; RV32I-NEXT: and a1, s0, s3 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, s4 -; RV32I-NEXT: mv a1, s1 -; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: add a1, a4, a1 +; RV32I-NEXT: srli a4, a1, 4 +; RV32I-NEXT: add a1, a1, a4 +; RV32I-NEXT: lui a4, 61681 +; RV32I-NEXT: addi a4, a4, -241 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: slli a5, a1, 8 +; RV32I-NEXT: add a1, a1, a5 +; RV32I-NEXT: slli a5, a1, 16 +; RV32I-NEXT: add a1, a1, a5 +; RV32I-NEXT: srli a1, a1, 24 +; RV32I-NEXT: srli a5, a0, 1 +; RV32I-NEXT: and a3, a5, a3 +; RV32I-NEXT: sub a0, a0, a3 +; RV32I-NEXT: and a3, a0, a2 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: srli a2, a0, 4 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: slli a2, a0, 8 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: slli a2, a0, 16 +; RV32I-NEXT: add a0, a0, a2 ; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: add a0, a0, s5 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: li a1, 0 -; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; ; RV64I-LABEL: test_ctpop_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 @@ -2719,14 +2634,13 @@ define i64 @test_ctpop_i64(i64 %a) nounwind { ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srli a0, a0, 56 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV32M-LABEL: test_ctpop_i64: @@ -2814,65 +2728,48 @@ define i64 @test_ctpop_i64(i64 %a) nounwind { ; ; RV32XTHEADBB-LABEL: test_ctpop_i64: ; RV32XTHEADBB: # %bb.0: -; RV32XTHEADBB-NEXT: addi sp, sp, -32 -; RV32XTHEADBB-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32XTHEADBB-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32XTHEADBB-NEXT: sw s1, 20(sp) # 4-byte Folded Spill -; RV32XTHEADBB-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32XTHEADBB-NEXT: sw s3, 12(sp) # 4-byte Folded Spill -; RV32XTHEADBB-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32XTHEADBB-NEXT: sw s5, 4(sp) # 4-byte Folded Spill -; RV32XTHEADBB-NEXT: mv s0, a0 -; RV32XTHEADBB-NEXT: srli a0, a1, 1 -; RV32XTHEADBB-NEXT: 
lui a2, 349525 -; RV32XTHEADBB-NEXT: addi s2, a2, 1365 -; RV32XTHEADBB-NEXT: and a0, a0, s2 -; RV32XTHEADBB-NEXT: sub a1, a1, a0 -; RV32XTHEADBB-NEXT: lui a0, 209715 -; RV32XTHEADBB-NEXT: addi s3, a0, 819 -; RV32XTHEADBB-NEXT: and a0, a1, s3 +; RV32XTHEADBB-NEXT: srli a2, a1, 1 +; RV32XTHEADBB-NEXT: lui a3, 349525 +; RV32XTHEADBB-NEXT: addi a3, a3, 1365 +; RV32XTHEADBB-NEXT: and a2, a2, a3 +; RV32XTHEADBB-NEXT: sub a1, a1, a2 +; RV32XTHEADBB-NEXT: lui a2, 209715 +; RV32XTHEADBB-NEXT: addi a2, a2, 819 +; RV32XTHEADBB-NEXT: and a4, a1, a2 ; RV32XTHEADBB-NEXT: srli a1, a1, 2 -; RV32XTHEADBB-NEXT: and a1, a1, s3 -; RV32XTHEADBB-NEXT: add a0, a0, a1 -; RV32XTHEADBB-NEXT: srli a1, a0, 4 -; RV32XTHEADBB-NEXT: add a0, a0, a1 -; RV32XTHEADBB-NEXT: lui a1, 61681 -; RV32XTHEADBB-NEXT: addi s4, a1, -241 -; RV32XTHEADBB-NEXT: and a0, a0, s4 -; RV32XTHEADBB-NEXT: lui a1, 4112 -; RV32XTHEADBB-NEXT: addi s1, a1, 257 -; RV32XTHEADBB-NEXT: mv a1, s1 -; RV32XTHEADBB-NEXT: call __mulsi3 -; RV32XTHEADBB-NEXT: srli s5, a0, 24 -; RV32XTHEADBB-NEXT: srli a0, s0, 1 -; RV32XTHEADBB-NEXT: and a0, a0, s2 -; RV32XTHEADBB-NEXT: sub s0, s0, a0 -; RV32XTHEADBB-NEXT: and a0, s0, s3 -; RV32XTHEADBB-NEXT: srli s0, s0, 2 -; RV32XTHEADBB-NEXT: and a1, s0, s3 -; RV32XTHEADBB-NEXT: add a0, a0, a1 -; RV32XTHEADBB-NEXT: srli a1, a0, 4 -; RV32XTHEADBB-NEXT: add a0, a0, a1 -; RV32XTHEADBB-NEXT: and a0, a0, s4 -; RV32XTHEADBB-NEXT: mv a1, s1 -; RV32XTHEADBB-NEXT: call __mulsi3 +; RV32XTHEADBB-NEXT: and a1, a1, a2 +; RV32XTHEADBB-NEXT: add a1, a4, a1 +; RV32XTHEADBB-NEXT: srli a4, a1, 4 +; RV32XTHEADBB-NEXT: add a1, a1, a4 +; RV32XTHEADBB-NEXT: lui a4, 61681 +; RV32XTHEADBB-NEXT: addi a4, a4, -241 +; RV32XTHEADBB-NEXT: and a1, a1, a4 +; RV32XTHEADBB-NEXT: slli a5, a1, 8 +; RV32XTHEADBB-NEXT: add a1, a1, a5 +; RV32XTHEADBB-NEXT: slli a5, a1, 16 +; RV32XTHEADBB-NEXT: add a1, a1, a5 +; RV32XTHEADBB-NEXT: srli a1, a1, 24 +; RV32XTHEADBB-NEXT: srli a5, a0, 1 +; RV32XTHEADBB-NEXT: and a3, a5, a3 +; RV32XTHEADBB-NEXT: sub a0, a0, a3 +; RV32XTHEADBB-NEXT: and a3, a0, a2 +; RV32XTHEADBB-NEXT: srli a0, a0, 2 +; RV32XTHEADBB-NEXT: and a0, a0, a2 +; RV32XTHEADBB-NEXT: add a0, a3, a0 +; RV32XTHEADBB-NEXT: srli a2, a0, 4 +; RV32XTHEADBB-NEXT: add a0, a0, a2 +; RV32XTHEADBB-NEXT: and a0, a0, a4 +; RV32XTHEADBB-NEXT: slli a2, a0, 8 +; RV32XTHEADBB-NEXT: add a0, a0, a2 +; RV32XTHEADBB-NEXT: slli a2, a0, 16 +; RV32XTHEADBB-NEXT: add a0, a0, a2 ; RV32XTHEADBB-NEXT: srli a0, a0, 24 -; RV32XTHEADBB-NEXT: add a0, a0, s5 +; RV32XTHEADBB-NEXT: add a0, a0, a1 ; RV32XTHEADBB-NEXT: li a1, 0 -; RV32XTHEADBB-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32XTHEADBB-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32XTHEADBB-NEXT: lw s1, 20(sp) # 4-byte Folded Reload -; RV32XTHEADBB-NEXT: lw s2, 16(sp) # 4-byte Folded Reload -; RV32XTHEADBB-NEXT: lw s3, 12(sp) # 4-byte Folded Reload -; RV32XTHEADBB-NEXT: lw s4, 8(sp) # 4-byte Folded Reload -; RV32XTHEADBB-NEXT: lw s5, 4(sp) # 4-byte Folded Reload -; RV32XTHEADBB-NEXT: addi sp, sp, 32 ; RV32XTHEADBB-NEXT: ret ; ; RV64XTHEADBB-LABEL: test_ctpop_i64: ; RV64XTHEADBB: # %bb.0: -; RV64XTHEADBB-NEXT: addi sp, sp, -16 -; RV64XTHEADBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64XTHEADBB-NEXT: srli a1, a0, 1 ; RV64XTHEADBB-NEXT: lui a2, 349525 ; RV64XTHEADBB-NEXT: addiw a2, a2, 1365 @@ -2895,14 +2792,13 @@ define i64 @test_ctpop_i64(i64 %a) nounwind { ; RV64XTHEADBB-NEXT: slli a2, a1, 32 ; RV64XTHEADBB-NEXT: add a1, a1, a2 ; RV64XTHEADBB-NEXT: and a0, a0, a1 -; RV64XTHEADBB-NEXT: lui a1, 4112 -; RV64XTHEADBB-NEXT: addiw 
a1, a1, 257 -; RV64XTHEADBB-NEXT: slli a2, a1, 32 -; RV64XTHEADBB-NEXT: add a1, a1, a2 -; RV64XTHEADBB-NEXT: call __muldi3 +; RV64XTHEADBB-NEXT: slli a1, a0, 8 +; RV64XTHEADBB-NEXT: add a0, a0, a1 +; RV64XTHEADBB-NEXT: slli a1, a0, 16 +; RV64XTHEADBB-NEXT: add a0, a0, a1 +; RV64XTHEADBB-NEXT: slli a1, a0, 32 +; RV64XTHEADBB-NEXT: add a0, a0, a1 ; RV64XTHEADBB-NEXT: srli a0, a0, 56 -; RV64XTHEADBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64XTHEADBB-NEXT: addi sp, sp, 16 ; RV64XTHEADBB-NEXT: ret %1 = call i64 @llvm.ctpop.i64(i64 %a) ret i64 %1 diff --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll index adf614435b31d..9ae30e646fdbf 100644 --- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll +++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll @@ -602,19 +602,16 @@ define signext i32 @ctlz(i64 %b) nounwind { ; ; RV32I-LABEL: ctlz: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a1 -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: srli a0, a1, 1 -; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: addi a4, a2, 1365 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a3, a2, 819 +; RV32I-NEXT: lui a2, 61681 +; RV32I-NEXT: addi a2, a2, -241 +; RV32I-NEXT: bnez a1, .LBB7_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 2 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 4 @@ -625,28 +622,26 @@ define signext i32 @ctlz(i64 %b) nounwind { ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: not a0, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi s4, a2, 1365 -; RV32I-NEXT: and a1, a1, s4 +; RV32I-NEXT: and a1, a1, a4 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi s5, a1, 819 -; RV32I-NEXT: and a1, a0, s5 +; RV32I-NEXT: and a1, a0, a3 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s5 +; RV32I-NEXT: and a0, a0, a3 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi s6, a1, -241 -; RV32I-NEXT: and a0, a0, s6 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi s3, a1, 257 -; RV32I-NEXT: mv a1, s3 -; RV32I-NEXT: call __mulsi3 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: srli a0, s2, 1 -; RV32I-NEXT: or a0, s2, a0 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: addi a0, a0, 32 +; RV32I-NEXT: andi a0, a0, 63 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB7_2: +; RV32I-NEXT: srli a0, a1, 1 +; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 2 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 4 @@ -657,41 +652,25 @@ define signext i32 @ctlz(i64 %b) nounwind { ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: not a0, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, s4 +; RV32I-NEXT: and a1, a1, a4 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, s5 +; RV32I-NEXT: and a1, a0, a3 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s5 +; 
RV32I-NEXT: and a0, a0, a3 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, s6 -; RV32I-NEXT: mv a1, s3 -; RV32I-NEXT: call __mulsi3 -; RV32I-NEXT: bnez s0, .LBB7_2 -; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: addi s1, a0, 32 -; RV32I-NEXT: j .LBB7_3 -; RV32I-NEXT: .LBB7_2: -; RV32I-NEXT: srli s1, s1, 24 -; RV32I-NEXT: .LBB7_3: # %entry -; RV32I-NEXT: andi a0, s1, 63 -; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 0(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: andi a0, a0, 63 ; RV32I-NEXT: ret ; ; RV64I-LABEL: ctlz: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 2 @@ -727,15 +706,14 @@ define signext i32 @ctlz(i64 %b) nounwind { ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: slli a0, a0, 2 ; RV64I-NEXT: srli a0, a0, 58 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll index 3731b9719445e..b45ab135fa1c7 100644 --- a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll @@ -11,8 +11,6 @@ define i32 @ctlz_i32(i32 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: beqz a0, .LBB0_2 ; RV32I-NEXT: # %bb.1: # %cond.false -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: srli a1, a0, 1 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 2 @@ -40,12 +38,11 @@ define i32 @ctlz_i32(i32 %a) nounwind { ; RV32I-NEXT: lui a1, 61681 ; RV32I-NEXT: addi a1, a1, -241 ; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi a1, a1, 257 -; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB0_2: ; RV32I-NEXT: li a0, 32 @@ -64,19 +61,16 @@ declare i64 @llvm.ctlz.i64(i64, i1) define i64 @ctlz_i64(i64 %a) nounwind { ; RV32I-LABEL: ctlz_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 
4(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a1 -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: srli a0, a1, 1 -; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: addi a4, a2, 1365 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a3, a2, 819 +; RV32I-NEXT: lui a2, 61681 +; RV32I-NEXT: addi a2, a2, -241 +; RV32I-NEXT: bnez a1, .LBB1_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 2 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 4 @@ -87,28 +81,26 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: not a0, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi s4, a2, 1365 -; RV32I-NEXT: and a1, a1, s4 +; RV32I-NEXT: and a1, a1, a4 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi s5, a1, 819 -; RV32I-NEXT: and a1, a0, s5 +; RV32I-NEXT: and a1, a0, a3 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s5 +; RV32I-NEXT: and a0, a0, a3 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi s6, a1, -241 -; RV32I-NEXT: and a0, a0, s6 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi s3, a1, 257 -; RV32I-NEXT: mv a1, s3 -; RV32I-NEXT: call __mulsi3 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: srli a0, s2, 1 -; RV32I-NEXT: or a0, s2, a0 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: addi a0, a0, 32 +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB1_2: +; RV32I-NEXT: srli a0, a1, 1 +; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 2 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 4 @@ -119,35 +111,21 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: not a0, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, s4 +; RV32I-NEXT: and a1, a1, a4 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, s5 +; RV32I-NEXT: and a1, a0, a3 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s5 +; RV32I-NEXT: and a0, a0, a3 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, s6 -; RV32I-NEXT: mv a1, s3 -; RV32I-NEXT: call __mulsi3 -; RV32I-NEXT: bnez s0, .LBB1_2 -; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: addi a0, a0, 32 -; RV32I-NEXT: j .LBB1_3 -; RV32I-NEXT: .LBB1_2: -; RV32I-NEXT: srli a0, s1, 24 -; RV32I-NEXT: .LBB1_3: ; RV32I-NEXT: li a1, 0 -; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 0(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; ; RV32XTHEADBB-LABEL: ctlz_i64: diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll index 36c107061795c..7e6c3f9c87d27 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -11,8 +11,6 @@ 
define i32 @ctlz_i32(i32 %a) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: beqz a0, .LBB0_2 ; RV32I-NEXT: # %bb.1: # %cond.false -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: srli a1, a0, 1 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 2 @@ -40,12 +38,11 @@ define i32 @ctlz_i32(i32 %a) nounwind { ; RV32I-NEXT: lui a1, 61681 ; RV32I-NEXT: addi a1, a1, -241 ; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi a1, a1, 257 -; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB0_2: ; RV32I-NEXT: li a0, 32 @@ -64,19 +61,16 @@ declare i64 @llvm.ctlz.i64(i64, i1) define i64 @ctlz_i64(i64 %a) nounwind { ; RV32I-LABEL: ctlz_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 0(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a1 -; RV32I-NEXT: mv s2, a0 -; RV32I-NEXT: srli a0, a1, 1 -; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: addi a4, a2, 1365 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a3, a2, 819 +; RV32I-NEXT: lui a2, 61681 +; RV32I-NEXT: addi a2, a2, -241 +; RV32I-NEXT: bnez a1, .LBB1_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 2 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 4 @@ -87,28 +81,26 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: not a0, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi s4, a2, 1365 -; RV32I-NEXT: and a1, a1, s4 +; RV32I-NEXT: and a1, a1, a4 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi s5, a1, 819 -; RV32I-NEXT: and a1, a0, s5 +; RV32I-NEXT: and a1, a0, a3 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s5 +; RV32I-NEXT: and a0, a0, a3 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi s6, a1, -241 -; RV32I-NEXT: and a0, a0, s6 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi s3, a1, 257 -; RV32I-NEXT: mv a1, s3 -; RV32I-NEXT: call __mulsi3 -; RV32I-NEXT: mv s1, a0 -; RV32I-NEXT: srli a0, s2, 1 -; RV32I-NEXT: or a0, s2, a0 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: addi a0, a0, 32 +; RV32I-NEXT: li a1, 0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB1_2: +; RV32I-NEXT: srli a0, a1, 1 +; RV32I-NEXT: or a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 2 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 4 @@ -119,35 +111,21 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: not a0, a0 ; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: and a1, a1, s4 +; RV32I-NEXT: and a1, a1, a4 ; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: and a1, a0, s5 +; RV32I-NEXT: and a1, a0, a3 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and 
a0, a0, s5 +; RV32I-NEXT: and a0, a0, a3 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, s6 -; RV32I-NEXT: mv a1, s3 -; RV32I-NEXT: call __mulsi3 -; RV32I-NEXT: bnez s0, .LBB1_2 -; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: addi a0, a0, 32 -; RV32I-NEXT: j .LBB1_3 -; RV32I-NEXT: .LBB1_2: -; RV32I-NEXT: srli a0, s1, 24 -; RV32I-NEXT: .LBB1_3: ; RV32I-NEXT: li a1, 0 -; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 0(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: ctlz_i64: @@ -275,8 +253,6 @@ declare i32 @llvm.ctpop.i32(i32) define i32 @ctpop_i32(i32 %a) nounwind { ; RV32I-LABEL: ctpop_i32: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: srli a1, a0, 1 ; RV32I-NEXT: lui a2, 349525 ; RV32I-NEXT: addi a2, a2, 1365 @@ -293,12 +269,11 @@ define i32 @ctpop_i32(i32 %a) nounwind { ; RV32I-NEXT: lui a1, 61681 ; RV32I-NEXT: addi a1, a1, -241 ; RV32I-NEXT: and a0, a0, a1 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi a1, a1, 257 -; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: ctpop_i32: @@ -390,58 +365,42 @@ declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind { ; RV32I-LABEL: ctpop_v2i32: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a1 -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi s3, a2, 1365 -; RV32I-NEXT: and a1, a1, s3 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi s4, a1, 819 -; RV32I-NEXT: and a1, a0, s4 +; RV32I-NEXT: srli a2, a0, 1 +; RV32I-NEXT: lui a3, 349525 +; RV32I-NEXT: addi a3, a3, 1365 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: sub a0, a0, a2 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a2, a2, 819 +; RV32I-NEXT: and a4, a0, a2 ; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s4 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi s5, a1, -241 -; RV32I-NEXT: and a0, a0, s5 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi s1, a1, 257 -; RV32I-NEXT: mv a1, s1 -; RV32I-NEXT: call __mulsi3 -; RV32I-NEXT: srli s2, a0, 24 -; RV32I-NEXT: srli a0, s0, 1 -; RV32I-NEXT: and a0, a0, s3 -; RV32I-NEXT: sub s0, s0, a0 -; RV32I-NEXT: and a0, s0, s4 -; RV32I-NEXT: srli s0, s0, 2 -; RV32I-NEXT: 
and a1, s0, s4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, s5 -; RV32I-NEXT: mv a1, s1 -; RV32I-NEXT: call __mulsi3 -; RV32I-NEXT: srli a1, a0, 24 -; RV32I-NEXT: mv a0, s2 -; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: add a0, a4, a0 +; RV32I-NEXT: srli a4, a0, 4 +; RV32I-NEXT: add a0, a0, a4 +; RV32I-NEXT: lui a4, 61681 +; RV32I-NEXT: addi a4, a4, -241 +; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: slli a5, a0, 8 +; RV32I-NEXT: add a0, a0, a5 +; RV32I-NEXT: slli a5, a0, 16 +; RV32I-NEXT: add a0, a0, a5 +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: srli a5, a1, 1 +; RV32I-NEXT: and a3, a5, a3 +; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: and a3, a1, a2 +; RV32I-NEXT: srli a1, a1, 2 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: srli a2, a1, 4 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: slli a2, a1, 8 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: slli a2, a1, 16 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: srli a1, a1, 24 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: ctpop_v2i32: @@ -558,59 +517,44 @@ declare i64 @llvm.ctpop.i64(i64) define i64 @ctpop_i64(i64 %a) nounwind { ; RV32I-LABEL: ctpop_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -32 -; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: srli a0, a1, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi s2, a2, 1365 -; RV32I-NEXT: and a0, a0, s2 -; RV32I-NEXT: sub a1, a1, a0 -; RV32I-NEXT: lui a0, 209715 -; RV32I-NEXT: addi s3, a0, 819 -; RV32I-NEXT: and a0, a1, s3 +; RV32I-NEXT: srli a2, a1, 1 +; RV32I-NEXT: lui a3, 349525 +; RV32I-NEXT: addi a3, a3, 1365 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: lui a2, 209715 +; RV32I-NEXT: addi a2, a2, 819 +; RV32I-NEXT: and a4, a1, a2 ; RV32I-NEXT: srli a1, a1, 2 -; RV32I-NEXT: and a1, a1, s3 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi s4, a1, -241 -; RV32I-NEXT: and a0, a0, s4 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi s1, a1, 257 -; RV32I-NEXT: mv a1, s1 -; RV32I-NEXT: call __mulsi3 -; RV32I-NEXT: srli s5, a0, 24 -; RV32I-NEXT: srli a0, s0, 1 -; RV32I-NEXT: and a0, a0, s2 -; RV32I-NEXT: sub s0, s0, a0 -; RV32I-NEXT: and a0, s0, s3 -; RV32I-NEXT: srli s0, s0, 2 -; RV32I-NEXT: and a1, s0, s3 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, s4 -; RV32I-NEXT: mv a1, s1 -; RV32I-NEXT: call __mulsi3 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: add a1, a4, a1 +; RV32I-NEXT: srli a4, a1, 4 +; RV32I-NEXT: add a1, a1, a4 +; RV32I-NEXT: lui a4, 61681 +; RV32I-NEXT: addi a4, a4, -241 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: slli a5, a1, 8 +; RV32I-NEXT: add a1, a1, 
a5 +; RV32I-NEXT: slli a5, a1, 16 +; RV32I-NEXT: add a1, a1, a5 +; RV32I-NEXT: srli a1, a1, 24 +; RV32I-NEXT: srli a5, a0, 1 +; RV32I-NEXT: and a3, a5, a3 +; RV32I-NEXT: sub a0, a0, a3 +; RV32I-NEXT: and a3, a0, a2 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: add a0, a3, a0 +; RV32I-NEXT: srli a2, a0, 4 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: slli a2, a0, 8 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: slli a2, a0, 16 +; RV32I-NEXT: add a0, a0, a2 ; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: add a0, a0, s5 +; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: li a1, 0 -; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 32 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: ctpop_i64: @@ -738,99 +682,82 @@ declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind { ; RV32I-LABEL: ctpop_v2i64: ; RV32I: # %bb.0: -; RV32I-NEXT: addi sp, sp, -48 -; RV32I-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s0, 40(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s1, 36(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s2, 32(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s3, 28(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s4, 24(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s5, 20(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s6, 16(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s7, 12(sp) # 4-byte Folded Spill -; RV32I-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lw a0, 4(a1) -; RV32I-NEXT: lw s2, 8(a1) -; RV32I-NEXT: lw s5, 12(a1) -; RV32I-NEXT: lw s6, 0(a1) -; RV32I-NEXT: srli a1, a0, 1 -; RV32I-NEXT: lui a2, 349525 -; RV32I-NEXT: addi s3, a2, 1365 -; RV32I-NEXT: and a1, a1, s3 -; RV32I-NEXT: sub a0, a0, a1 -; RV32I-NEXT: lui a1, 209715 -; RV32I-NEXT: addi s4, a1, 819 -; RV32I-NEXT: and a1, a0, s4 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s4 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lui a1, 61681 -; RV32I-NEXT: addi s7, a1, -241 -; RV32I-NEXT: and a0, a0, s7 -; RV32I-NEXT: lui a1, 4112 -; RV32I-NEXT: addi s1, a1, 257 -; RV32I-NEXT: mv a1, s1 -; RV32I-NEXT: call __mulsi3 -; RV32I-NEXT: srli s8, a0, 24 -; RV32I-NEXT: srli a0, s6, 1 -; RV32I-NEXT: and a0, a0, s3 -; RV32I-NEXT: sub a0, s6, a0 -; RV32I-NEXT: and a1, a0, s4 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s4 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, s7 -; RV32I-NEXT: mv a1, s1 -; RV32I-NEXT: call __mulsi3 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: add s8, a0, s8 -; RV32I-NEXT: srli a0, s5, 1 -; RV32I-NEXT: and a0, a0, s3 -; RV32I-NEXT: sub a0, s5, a0 -; RV32I-NEXT: and a1, a0, s4 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s4 -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, s7 -; RV32I-NEXT: mv a1, s1 -; RV32I-NEXT: call __mulsi3 -; RV32I-NEXT: srli s5, a0, 24 -; RV32I-NEXT: srli a0, s2, 1 -; RV32I-NEXT: and a0, a0, s3 -; RV32I-NEXT: sub a0, s2, a0 -; RV32I-NEXT: and a1, a0, s4 -; RV32I-NEXT: srli a0, a0, 2 -; RV32I-NEXT: and a0, a0, s4 -; RV32I-NEXT: add a0, 
a1, a0 -; RV32I-NEXT: srli a1, a0, 4 -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: and a0, a0, s7 -; RV32I-NEXT: mv a1, s1 -; RV32I-NEXT: call __mulsi3 -; RV32I-NEXT: srli a0, a0, 24 -; RV32I-NEXT: add a0, a0, s5 -; RV32I-NEXT: sw zero, 12(s0) -; RV32I-NEXT: sw zero, 4(s0) -; RV32I-NEXT: sw a0, 8(s0) -; RV32I-NEXT: sw s8, 0(s0) -; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s1, 36(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s2, 32(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s3, 28(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s4, 24(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s5, 20(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s6, 16(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s7, 12(sp) # 4-byte Folded Reload -; RV32I-NEXT: lw s8, 8(sp) # 4-byte Folded Reload -; RV32I-NEXT: addi sp, sp, 48 +; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw a2, 8(a1) +; RV32I-NEXT: lw a4, 12(a1) +; RV32I-NEXT: lw a1, 0(a1) +; RV32I-NEXT: srli a5, a3, 1 +; RV32I-NEXT: lui a6, 349525 +; RV32I-NEXT: addi a6, a6, 1365 +; RV32I-NEXT: and a5, a5, a6 +; RV32I-NEXT: sub a3, a3, a5 +; RV32I-NEXT: lui a5, 209715 +; RV32I-NEXT: addi a5, a5, 819 +; RV32I-NEXT: and a7, a3, a5 +; RV32I-NEXT: srli a3, a3, 2 +; RV32I-NEXT: and a3, a3, a5 +; RV32I-NEXT: add a3, a7, a3 +; RV32I-NEXT: srli a7, a3, 4 +; RV32I-NEXT: add a3, a3, a7 +; RV32I-NEXT: lui a7, 61681 +; RV32I-NEXT: addi a7, a7, -241 +; RV32I-NEXT: and a3, a3, a7 +; RV32I-NEXT: slli t0, a3, 8 +; RV32I-NEXT: add a3, a3, t0 +; RV32I-NEXT: slli t0, a3, 16 +; RV32I-NEXT: add a3, a3, t0 +; RV32I-NEXT: srli a3, a3, 24 +; RV32I-NEXT: srli t0, a1, 1 +; RV32I-NEXT: and t0, t0, a6 +; RV32I-NEXT: sub a1, a1, t0 +; RV32I-NEXT: and t0, a1, a5 +; RV32I-NEXT: srli a1, a1, 2 +; RV32I-NEXT: and a1, a1, a5 +; RV32I-NEXT: add a1, t0, a1 +; RV32I-NEXT: srli t0, a1, 4 +; RV32I-NEXT: add a1, a1, t0 +; RV32I-NEXT: and a1, a1, a7 +; RV32I-NEXT: slli t0, a1, 8 +; RV32I-NEXT: add a1, a1, t0 +; RV32I-NEXT: slli t0, a1, 16 +; RV32I-NEXT: add a1, a1, t0 +; RV32I-NEXT: srli a1, a1, 24 +; RV32I-NEXT: add a1, a1, a3 +; RV32I-NEXT: srli a3, a4, 1 +; RV32I-NEXT: and a3, a3, a6 +; RV32I-NEXT: sub a4, a4, a3 +; RV32I-NEXT: and a3, a4, a5 +; RV32I-NEXT: srli a4, a4, 2 +; RV32I-NEXT: and a4, a4, a5 +; RV32I-NEXT: add a3, a3, a4 +; RV32I-NEXT: srli a4, a3, 4 +; RV32I-NEXT: add a3, a3, a4 +; RV32I-NEXT: and a3, a3, a7 +; RV32I-NEXT: slli a4, a3, 8 +; RV32I-NEXT: add a3, a3, a4 +; RV32I-NEXT: slli a4, a3, 16 +; RV32I-NEXT: add a3, a3, a4 +; RV32I-NEXT: srli a3, a3, 24 +; RV32I-NEXT: srli a4, a2, 1 +; RV32I-NEXT: and a4, a4, a6 +; RV32I-NEXT: sub a2, a2, a4 +; RV32I-NEXT: and a4, a2, a5 +; RV32I-NEXT: srli a2, a2, 2 +; RV32I-NEXT: and a2, a2, a5 +; RV32I-NEXT: add a2, a4, a2 +; RV32I-NEXT: srli a4, a2, 4 +; RV32I-NEXT: add a2, a2, a4 +; RV32I-NEXT: and a2, a2, a7 +; RV32I-NEXT: slli a4, a2, 8 +; RV32I-NEXT: add a2, a2, a4 +; RV32I-NEXT: slli a4, a2, 16 +; RV32I-NEXT: add a2, a2, a4 +; RV32I-NEXT: srli a2, a2, 24 +; RV32I-NEXT: add a2, a2, a3 +; RV32I-NEXT: sw zero, 12(a0) +; RV32I-NEXT: sw zero, 4(a0) +; RV32I-NEXT: sw a2, 8(a0) +; RV32I-NEXT: sw a1, 0(a0) ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: ctpop_v2i64: diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll index 73bfc6480b4d7..acd63f24bb8f7 100644 --- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64xtheadbb.ll @@ -317,8 +317,6 @@ define i64 @ctlz_i64(i64 %a) 
nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: beqz a0, .LBB5_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 2 @@ -354,14 +352,13 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srli a0, a0, 56 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB5_2: ; RV64I-NEXT: li a0, 64 diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll index 7feef4dad4116..b0e447b71178b 100644 --- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zbb.ll @@ -307,8 +307,6 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: beqz a0, .LBB5_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 2 @@ -344,14 +342,13 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srli a0, a0, 56 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB5_2: ; RV64I-NEXT: li a0, 64 @@ -623,8 +620,6 @@ declare i64 @llvm.ctpop.i64(i64) define i64 @ctpop_i64(i64 %a) nounwind { ; RV64I-LABEL: ctpop_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 @@ -647,14 +642,13 @@ define i64 @ctpop_i64(i64 %a) nounwind { ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srli a0, a0, 56 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: ctpop_i64: diff --git a/llvm/test/CodeGen/RISCV/rv64-typepromotion.ll b/llvm/test/CodeGen/RISCV/rv64-typepromotion.ll new file mode 100644 index 0000000000000..23eae33739c97 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rv64-typepromotion.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -mtriple=riscv64 -passes=typepromotion -S %s | FileCheck %s + +; Test that 
this does not crash +define i16 @test(i8 %a, i32 %b) { +; CHECK-LABEL: define i16 @test( +; CHECK-SAME: i8 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[B]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP0]], 255 +; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i16 +; CHECK-NEXT: ret i16 [[TMP7]] +; +entry: + %0 = zext i8 %a to i32 + %1 = trunc i32 %b to i16 + %2 = icmp eq i16 %1, 0 + %3 = trunc i32 %0 to i8 + %4 = zext i8 %3 to i16 + %5 = xor i16 %4, %1 + ret i16 %5 +} diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll index 1f62ea9f56819..6cdab888ffcde 100644 --- a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll @@ -11,8 +11,6 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: beqz a0, .LBB0_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srliw a1, a0, 1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 @@ -38,14 +36,13 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind { ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: addi a1, a1, -241 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srliw a0, a0, 24 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB0_2: ; RV64I-NEXT: li a0, 32 @@ -66,8 +63,6 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: beqz a0, .LBB1_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srliw a1, a0, 1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 @@ -93,14 +88,13 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: addi a1, a1, -241 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srliw a0, a0, 24 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: j .LBB1_3 ; RV64I-NEXT: .LBB1_2: ; RV64I-NEXT: li a0, 32 @@ -125,50 +119,45 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { define signext i32 @log2_ceil_i32(i32 signext %a) nounwind { ; RV64I-LABEL: log2_ceil_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: addiw a0, a0, -1 -; RV64I-NEXT: li s0, 32 -; RV64I-NEXT: li a1, 32 -; RV64I-NEXT: beqz a0, .LBB2_2 +; RV64I-NEXT: addiw a1, a0, -1 +; RV64I-NEXT: li a0, 32 +; RV64I-NEXT: li a2, 32 +; RV64I-NEXT: beqz a1, .LBB2_2 ; RV64I-NEXT: # 
%bb.1: # %cond.false -; RV64I-NEXT: srliw a1, a0, 1 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: srliw a2, a1, 1 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: srliw a2, a1, 2 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: srliw a2, a1, 4 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: srliw a2, a1, 8 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: srliw a2, a1, 16 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: not a1, a1 +; RV64I-NEXT: srli a2, a1, 1 +; RV64I-NEXT: lui a3, 349525 +; RV64I-NEXT: addiw a3, a3, 1365 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: sub a1, a1, a2 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: and a3, a1, a2 +; RV64I-NEXT: srli a1, a1, 2 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a1, a0, 24 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: srli a2, a1, 4 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: addi a2, a2, -241 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 8 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 16 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: srliw a2, a1, 24 ; RV64I-NEXT: .LBB2_2: # %cond.end -; RV64I-NEXT: sub a0, s0, a1 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: sub a0, a0, a2 ; RV64I-NEXT: ret ; ; RV64XTHEADBB-LABEL: log2_ceil_i32: @@ -189,48 +178,42 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind { define signext i32 @findLastSet_i32(i32 signext %a) nounwind { ; RV64I-LABEL: findLastSet_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: srliw a0, a0, 1 -; RV64I-NEXT: or a0, s0, a0 -; RV64I-NEXT: srliw a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: srliw a1, a0, 1 +; RV64I-NEXT: or a1, a0, a1 +; RV64I-NEXT: srliw a2, a1, 2 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: srliw a2, a1, 4 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: srliw a2, a1, 8 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: srliw a2, a1, 16 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: not a1, a1 +; RV64I-NEXT: srli a2, a1, 1 +; RV64I-NEXT: lui a3, 349525 +; RV64I-NEXT: addiw a3, a3, 1365 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: sub a1, a1, a2 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: and a3, a1, a2 +; RV64I-NEXT: srli a1, 
a1, 2 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 24 -; RV64I-NEXT: xori a0, a0, 31 -; RV64I-NEXT: snez a1, s0 -; RV64I-NEXT: addi a1, a1, -1 -; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: srli a2, a1, 4 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: addi a2, a2, -241 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 8 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 16 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: srliw a1, a1, 24 +; RV64I-NEXT: xori a1, a1, 31 +; RV64I-NEXT: snez a0, a0 +; RV64I-NEXT: addi a0, a0, -1 +; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64XTHEADBB-LABEL: findLastSet_i32: @@ -256,10 +239,6 @@ define i32 @ctlz_lshr_i32(i32 signext %a) { ; RV64I-NEXT: srliw a0, a0, 1 ; RV64I-NEXT: beqz a0, .LBB4_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: .cfi_def_cfa_offset 16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: .cfi_offset ra, -8 ; RV64I-NEXT: srliw a1, a0, 1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 @@ -285,14 +264,13 @@ define i32 @ctlz_lshr_i32(i32 signext %a) { ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: addi a1, a1, -241 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srliw a0, a0, 24 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB4_2: ; RV64I-NEXT: li a0, 32 @@ -317,8 +295,6 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: beqz a0, .LBB5_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 2 @@ -354,14 +330,13 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srli a0, a0, 56 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB5_2: ; RV64I-NEXT: li a0, 64 diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll index 2269d8d04c9cb..4d5ef5db86057 100644 --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -11,8 +11,6 @@ define signext i32 
@ctlz_i32(i32 signext %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: beqz a0, .LBB0_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srliw a1, a0, 1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 @@ -38,14 +36,13 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind { ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: addi a1, a1, -241 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srliw a0, a0, 24 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB0_2: ; RV64I-NEXT: li a0, 32 @@ -64,8 +61,6 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: beqz a0, .LBB1_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srliw a1, a0, 1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 @@ -91,14 +86,13 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: addi a1, a1, -241 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srliw a0, a0, 24 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: j .LBB1_3 ; RV64I-NEXT: .LBB1_2: ; RV64I-NEXT: li a0, 32 @@ -121,50 +115,45 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { define signext i32 @log2_ceil_i32(i32 signext %a) nounwind { ; RV64I-LABEL: log2_ceil_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: addiw a0, a0, -1 -; RV64I-NEXT: li s0, 32 -; RV64I-NEXT: li a1, 32 -; RV64I-NEXT: beqz a0, .LBB2_2 +; RV64I-NEXT: addiw a1, a0, -1 +; RV64I-NEXT: li a0, 32 +; RV64I-NEXT: li a2, 32 +; RV64I-NEXT: beqz a1, .LBB2_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: srliw a1, a0, 1 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: srliw a2, a1, 1 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: srliw a2, a1, 2 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: srliw a2, a1, 4 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: srliw a2, a1, 8 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: srliw a2, a1, 16 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: not a1, a1 +; RV64I-NEXT: srli a2, a1, 1 +; RV64I-NEXT: lui a3, 349525 +; RV64I-NEXT: addiw a3, a3, 1365 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: sub a1, a1, a2 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: and a3, a1, a2 +; RV64I-NEXT: srli a1, a1, 2 ; RV64I-NEXT: and a1, 
a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a1, a0, 24 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: srli a2, a1, 4 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: addi a2, a2, -241 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 8 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 16 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: srliw a2, a1, 24 ; RV64I-NEXT: .LBB2_2: # %cond.end -; RV64I-NEXT: sub a0, s0, a1 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: sub a0, a0, a2 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: log2_ceil_i32: @@ -183,48 +172,42 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind { define signext i32 @findLastSet_i32(i32 signext %a) nounwind { ; RV64I-LABEL: findLastSet_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: srliw a0, a0, 1 -; RV64I-NEXT: or a0, s0, a0 -; RV64I-NEXT: srliw a1, a0, 2 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 4 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 8 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: srliw a1, a0, 16 -; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: not a0, a0 -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: srliw a1, a0, 1 +; RV64I-NEXT: or a1, a0, a1 +; RV64I-NEXT: srliw a2, a1, 2 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: srliw a2, a1, 4 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: srliw a2, a1, 8 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: srliw a2, a1, 16 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: not a1, a1 +; RV64I-NEXT: srli a2, a1, 1 +; RV64I-NEXT: lui a3, 349525 +; RV64I-NEXT: addiw a3, a3, 1365 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: sub a1, a1, a2 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: and a3, a1, a2 +; RV64I-NEXT: srli a1, a1, 2 ; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: and a2, a0, a1 -; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: add a0, a2, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 -; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a0, a0, 24 -; RV64I-NEXT: xori a0, a0, 31 -; RV64I-NEXT: snez a1, s0 -; RV64I-NEXT: addi a1, a1, -1 -; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: srli a2, a1, 4 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: lui a2, 61681 +; RV64I-NEXT: addi a2, a2, -241 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 8 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 16 +; RV64I-NEXT: add a1, 
a1, a2 +; RV64I-NEXT: srliw a1, a1, 24 +; RV64I-NEXT: xori a1, a1, 31 +; RV64I-NEXT: snez a0, a0 +; RV64I-NEXT: addi a0, a0, -1 +; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: findLastSet_i32: @@ -248,10 +231,6 @@ define i32 @ctlz_lshr_i32(i32 signext %a) { ; RV64I-NEXT: srliw a0, a0, 1 ; RV64I-NEXT: beqz a0, .LBB4_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: .cfi_def_cfa_offset 16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: .cfi_offset ra, -8 ; RV64I-NEXT: srliw a1, a0, 1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 2 @@ -277,14 +256,13 @@ define i32 @ctlz_lshr_i32(i32 signext %a) { ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: addi a1, a1, -241 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srliw a0, a0, 24 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB4_2: ; RV64I-NEXT: li a0, 32 @@ -307,8 +285,6 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV64I: # %bb.0: ; RV64I-NEXT: beqz a0, .LBB5_2 ; RV64I-NEXT: # %bb.1: # %cond.false -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 2 @@ -344,14 +320,13 @@ define i64 @ctlz_i64(i64 %a) nounwind { ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srli a0, a0, 56 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; RV64I-NEXT: .LBB5_2: ; RV64I-NEXT: li a0, 64 @@ -544,8 +519,6 @@ declare i32 @llvm.ctpop.i32(i32) define signext i32 @ctpop_i32(i32 signext %a) nounwind { ; RV64I-LABEL: ctpop_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 @@ -560,14 +533,13 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind { ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: addi a1, a1, -241 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srliw a0, a0, 24 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: ctpop_i32: @@ -657,8 +629,6 @@ define i1 @ctpop_i32_ne_one(i32 signext %a) nounwind { define signext i32 @ctpop_i32_load(ptr %p) nounwind { ; RV64I-LABEL: ctpop_i32_load: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: lw a0, 0(a0) ; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 @@ -674,14 +644,13 @@ 
define signext i32 @ctpop_i32_load(ptr %p) nounwind { ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: addi a1, a1, -241 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srliw a0, a0, 24 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: ctpop_i32_load: @@ -699,58 +668,42 @@ declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind { ; RV64I-LABEL: ctpop_v2i32: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s4, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s5, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a1 -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw s3, a2, 1365 -; RV64I-NEXT: and a1, a1, s3 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw s4, a1, 819 -; RV64I-NEXT: and a1, a0, s4 +; RV64I-NEXT: srli a2, a0, 1 +; RV64I-NEXT: lui a3, 349525 +; RV64I-NEXT: addiw a3, a3, 1365 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: and a4, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, s4 -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw s5, a1, -241 -; RV64I-NEXT: and a0, a0, s5 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw s1, a1, 257 -; RV64I-NEXT: mv a1, s1 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw s2, a0, 24 -; RV64I-NEXT: srli a0, s0, 1 -; RV64I-NEXT: and a0, a0, s3 -; RV64I-NEXT: sub s0, s0, a0 -; RV64I-NEXT: and a0, s0, s4 -; RV64I-NEXT: srli s0, s0, 2 -; RV64I-NEXT: and a1, s0, s4 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: and a0, a0, s5 -; RV64I-NEXT: mv a1, s1 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srliw a1, a0, 24 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s4, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s5, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a4, a0 +; RV64I-NEXT: srli a4, a0, 4 +; RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: lui a4, 61681 +; RV64I-NEXT: addi a4, a4, -241 +; RV64I-NEXT: and a0, a0, a4 +; RV64I-NEXT: slli a5, a0, 8 +; RV64I-NEXT: add a0, a0, a5 +; RV64I-NEXT: slli a5, a0, 16 +; RV64I-NEXT: add a0, a0, a5 +; RV64I-NEXT: srliw a0, a0, 24 +; RV64I-NEXT: srli a5, a1, 1 +; RV64I-NEXT: and a3, a5, a3 +; RV64I-NEXT: sub a1, a1, a3 +; RV64I-NEXT: and a3, a1, a2 +; RV64I-NEXT: srli a1, a1, 2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: srli a2, a1, 4 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: and a1, a1, a4 +; 
RV64I-NEXT: slli a2, a1, 8 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli a2, a1, 16 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: srliw a1, a1, 24 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: ctpop_v2i32: @@ -875,8 +828,6 @@ declare i64 @llvm.ctpop.i64(i64) define i64 @ctpop_i64(i64 %a) nounwind { ; RV64I-LABEL: ctpop_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: srli a1, a0, 1 ; RV64I-NEXT: lui a2, 349525 ; RV64I-NEXT: addiw a2, a2, 1365 @@ -899,14 +850,13 @@ define i64 @ctpop_i64(i64 %a) nounwind { ; RV64I-NEXT: slli a2, a1, 32 ; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: and a0, a0, a1 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw a1, a1, 257 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: slli a1, a0, 8 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 16 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: slli a1, a0, 32 +; RV64I-NEXT: add a0, a0, a1 ; RV64I-NEXT: srli a0, a0, 56 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: ctpop_i64: @@ -998,66 +948,52 @@ declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind { ; RV64I-LABEL: ctpop_v2i64: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -64 -; RV64I-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 48(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 40(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 32(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 24(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s4, 16(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s5, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: mv s0, a1 -; RV64I-NEXT: srli a1, a0, 1 -; RV64I-NEXT: lui a2, 349525 -; RV64I-NEXT: addiw a2, a2, 1365 -; RV64I-NEXT: slli a3, a2, 32 -; RV64I-NEXT: add s3, a2, a3 -; RV64I-NEXT: and a1, a1, s3 -; RV64I-NEXT: sub a0, a0, a1 -; RV64I-NEXT: lui a1, 209715 -; RV64I-NEXT: addiw a1, a1, 819 -; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add s4, a1, a2 -; RV64I-NEXT: and a1, a0, s4 +; RV64I-NEXT: srli a2, a0, 1 +; RV64I-NEXT: lui a3, 349525 +; RV64I-NEXT: addiw a3, a3, 1365 +; RV64I-NEXT: slli a4, a3, 32 +; RV64I-NEXT: add a3, a3, a4 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: sub a0, a0, a2 +; RV64I-NEXT: lui a2, 209715 +; RV64I-NEXT: addiw a2, a2, 819 +; RV64I-NEXT: slli a4, a2, 32 +; RV64I-NEXT: add a2, a2, a4 +; RV64I-NEXT: and a4, a0, a2 ; RV64I-NEXT: srli a0, a0, 2 -; RV64I-NEXT: and a0, a0, s4 -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: lui a1, 61681 -; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: add a0, a4, a0 +; RV64I-NEXT: srli a4, a0, 4 +; RV64I-NEXT: add a0, a0, a4 +; RV64I-NEXT: lui a4, 61681 +; RV64I-NEXT: addiw a4, a4, -241 +; RV64I-NEXT: slli a5, a4, 32 +; RV64I-NEXT: add a4, a4, a5 +; RV64I-NEXT: and a0, a0, a4 +; RV64I-NEXT: slli a5, a0, 8 +; RV64I-NEXT: add a0, a0, a5 +; RV64I-NEXT: slli a5, a0, 16 +; RV64I-NEXT: add a0, a0, a5 +; RV64I-NEXT: slli a5, a0, 32 +; RV64I-NEXT: add a0, a0, a5 +; RV64I-NEXT: srli a0, a0, 56 +; RV64I-NEXT: srli a5, a1, 1 +; RV64I-NEXT: and a3, a5, a3 +; RV64I-NEXT: sub a1, a1, a3 +; RV64I-NEXT: and a3, a1, a2 +; RV64I-NEXT: srli a1, a1, 2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: add a1, a3, a1 +; RV64I-NEXT: srli a2, a1, 4 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: and a1, a1, a4 +; RV64I-NEXT: slli a2, a1, 8 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: slli 
a2, a1, 16 +; RV64I-NEXT: add a1, a1, a2 ; RV64I-NEXT: slli a2, a1, 32 -; RV64I-NEXT: add s5, a1, a2 -; RV64I-NEXT: and a0, a0, s5 -; RV64I-NEXT: lui a1, 4112 -; RV64I-NEXT: addiw s1, a1, 257 -; RV64I-NEXT: slli a1, s1, 32 -; RV64I-NEXT: add s1, s1, a1 -; RV64I-NEXT: mv a1, s1 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srli s2, a0, 56 -; RV64I-NEXT: srli a0, s0, 1 -; RV64I-NEXT: and a0, a0, s3 -; RV64I-NEXT: sub s0, s0, a0 -; RV64I-NEXT: and a0, s0, s4 -; RV64I-NEXT: srli s0, s0, 2 -; RV64I-NEXT: and a1, s0, s4 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: and a0, a0, s5 -; RV64I-NEXT: mv a1, s1 -; RV64I-NEXT: call __muldi3 -; RV64I-NEXT: srli a1, a0, 56 -; RV64I-NEXT: mv a0, s2 -; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 40(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 32(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 24(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s4, 16(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s5, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: add a1, a1, a2 +; RV64I-NEXT: srli a1, a1, 56 ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: ctpop_v2i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll index 571e2df13c263..7cc4a9da3d429 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-vops.ll @@ -1192,14 +1192,25 @@ define <vscale x 2 x i32> @vmerge_larger_vl_false_becomes_tail(<vscale x 2 x i32> define <vscale x 2 x i64> @vpmerge_vwsub.w_tied(<vscale x 2 x i64> %passthru, <vscale x 2 x i64> %x, <vscale x 2 x i32> %y, <vscale x 2 x i1> %mask, i32 zeroext %vl) { ; CHECK-LABEL: vpmerge_vwsub.w_tied: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma -; CHECK-NEXT: vmv2r.v v10, v8 -; CHECK-NEXT: vwsub.wv v10, v10, v12 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma -; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu +; CHECK-NEXT: vwsub.wv v8, v8, v12, v0.t ; CHECK-NEXT: ret %vl.zext = zext i32 %vl to i64 %a = call <vscale x 2 x i64> @llvm.riscv.vwsub.w.nxv2i64.nxv2i32(<vscale x 2 x i64> %passthru, <vscale x 2 x i64> %passthru, <vscale x 2 x i32> %y, i64 %vl.zext) %b = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> %mask, <vscale x 2 x i64> %a, <vscale x 2 x i64> %passthru, i32 %vl) ret <vscale x 2 x i64> %b } + +define <vscale x 2 x double> @vpmerge_vfwsub.w_tied(<vscale x 2 x double> %passthru, <vscale x 2 x double> %x, <vscale x 2 x float> %y, <vscale x 2 x i1> %mask, i32 zeroext %vl) { +; CHECK-LABEL: vpmerge_vfwsub.w_tied: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu +; CHECK-NEXT: fsrmi a0, 1 +; CHECK-NEXT: vfwsub.wv v8, v8, v12, v0.t +; CHECK-NEXT: fsrm a0 +; CHECK-NEXT: ret + %vl.zext = zext i32 %vl to i64 + %a = call <vscale x 2 x double> @llvm.riscv.vfwsub.w.nxv2f64.nxv2f32(<vscale x 2 x double> %passthru, <vscale x 2 x double> %passthru, <vscale x 2 x float> %y, i64 1, i64 %vl.zext) + %b = call <vscale x 2 x double> @llvm.vp.merge.nxv2f64(<vscale x 2 x i1> %mask, <vscale x 2 x double> %a, <vscale x 2 x double> %passthru, i32 %vl) + ret <vscale x 2 x double> %b +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll index 36bc10f055b84..66e6883dd1d3e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll @@ -1396,16 +1396,14 @@ define <vscale x 8 x i64> @i1_zext(<vscale x 8 x i1> %va, <vscale x 8 x i64> %vb ; %x.i32 and %y.i32 are disjoint, so DAGCombiner will combine it into an or. ; FIXME: We should be able to recover the or into vwaddu.vv if the disjoint flag is set.
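; Illustration (a minimal sketch, not part of the patch): the `disjoint` flag
; asserts that the two operands of the `or` have no set bits in common, so the
; `or` computes exactly the same value as an `add`; that equivalence is what
; would let ISel recover a vwaddu.vv here. Plain i32 stands in for the vector
; types, and the function name below is made up for the example:
define i32 @or_disjoint_equals_add(i32 %x) {
  %hi = shl i32 %x, 16             ; nonzero bits confined to positions 31:16
  %lo = and i32 %x, 65535          ; nonzero bits confined to positions 15:0
  %or = or disjoint i32 %hi, %lo   ; no overlap, so %or equals add i32 %hi, %lo
  ret i32 %or
}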
-define <vscale x 2 x i32> @disjoint_or(<vscale x 2 x i8> %x.i8, <vscale x 2 x i8> %y.i8) { -; CHECK-LABEL: disjoint_or: +define <vscale x 2 x i32> @vwaddu_vv_disjoint_or_add(<vscale x 2 x i8> %x.i8, <vscale x 2 x i8> %y.i8) { +; CHECK-LABEL: vwaddu_vv_disjoint_or_add: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vzext.vf2 v10, v8 -; CHECK-NEXT: vsll.vi v8, v10, 8 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vzext.vf2 v10, v8 -; CHECK-NEXT: vzext.vf4 v8, v9 -; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vsll.vi v10, v10, 8 +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vwaddu.vv v8, v10, v11 ; CHECK-NEXT: ret %x.i16 = zext <vscale x 2 x i8> %x.i8 to <vscale x 2 x i16> %x.shl = shl <vscale x 2 x i16> %x.i16, shufflevector(<vscale x 2 x i16> insertelement(<vscale x 2 x i16> poison, i16 8, i32 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer) @@ -1414,3 +1412,57 @@ define <vscale x 2 x i32> @disjoint_or(<vscale x 2 x i8> %x.i8, %add = add <vscale x 2 x i32> %x.i32, %y.i32 ret <vscale x 2 x i32> %add } + +; TODO: We could select vwaddu.vv, but when both arms of the or are the same +; DAGCombiner::hoistLogicOpWithSameOpcodeHands moves the zext above the or. +define <vscale x 2 x i32> @vwaddu_vv_disjoint_or(<vscale x 2 x i16> %x.i16, <vscale x 2 x i16> %y.i16) { +; CHECK-LABEL: vwaddu_vv_disjoint_or: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vor.vv v9, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: ret + %x.i32 = zext <vscale x 2 x i16> %x.i16 to <vscale x 2 x i32> + %y.i32 = zext <vscale x 2 x i16> %y.i16 to <vscale x 2 x i32> + %or = or disjoint <vscale x 2 x i32> %x.i32, %y.i32 + ret <vscale x 2 x i32> %or +} + +; TODO: We could select vwadd.vv, but when both arms of the or are the same +; DAGCombiner::hoistLogicOpWithSameOpcodeHands moves the zext above the or. +define <vscale x 2 x i32> @vwadd_vv_disjoint_or(<vscale x 2 x i16> %x.i16, <vscale x 2 x i16> %y.i16) { +; CHECK-LABEL: vwadd_vv_disjoint_or: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vor.vv v9, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: ret + %x.i32 = sext <vscale x 2 x i16> %x.i16 to <vscale x 2 x i32> + %y.i32 = sext <vscale x 2 x i16> %y.i16 to <vscale x 2 x i32> + %or = or disjoint <vscale x 2 x i32> %x.i32, %y.i32 + ret <vscale x 2 x i32> %or +} + +define <vscale x 2 x i32> @vwaddu_wv_disjoint_or(<vscale x 2 x i32> %x.i32, <vscale x 2 x i16> %y.i16) { +; CHECK-LABEL: vwaddu_wv_disjoint_or: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vwaddu.wv v8, v8, v9 +; CHECK-NEXT: ret + %y.i32 = zext <vscale x 2 x i16> %y.i16 to <vscale x 2 x i32> + %or = or disjoint <vscale x 2 x i32> %x.i32, %y.i32 + ret <vscale x 2 x i32> %or +} + +define <vscale x 2 x i32> @vwadd_wv_disjoint_or(<vscale x 2 x i32> %x.i32, <vscale x 2 x i16> %y.i16) { +; CHECK-LABEL: vwadd_wv_disjoint_or: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vwadd.wv v8, v8, v9 +; CHECK-NEXT: ret + %y.i32 = sext <vscale x 2 x i16> %y.i16 to <vscale x 2 x i32> + %or = or disjoint <vscale x 2 x i32> %x.i32, %y.i32 + ret <vscale x 2 x i32> %or +} diff --git a/llvm/test/MC/Disassembler/X86/apx/imulzu.txt b/llvm/test/MC/Disassembler/X86/apx/imulzu.txt new file mode 100644 index 0000000000000..86142e0540970 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/imulzu.txt @@ -0,0 +1,50 @@ +# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT +# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL + +# ATT: imulzuw $123, %dx, %dx +# INTEL: imulzu dx, dx, 123 +0x62,0xf4,0x7d,0x18,0x6b,0xd2,0x7b + +# ATT: imulzul $123, %ecx, %ecx +# INTEL: imulzu ecx, ecx, 123 +0x62,0xf4,0x7c,0x18,0x6b,0xc9,0x7b + +# ATT: imulzuq $123, %r9, %r9 +# INTEL: imulzu r9, r9, 123 +0x62,0x54,0xfc,0x18,0x6b,0xc9,0x7b + +# ATT: imulzuw $123, 291(%r8,%rax,4), %dx +# INTEL: imulzu dx, word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7d,0x18,0x6b,0x94,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: imulzul $123, 291(%r8,%rax,4), %ecx +# INTEL: imulzu ecx, dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x18,0x6b,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: imulzuq $123, 291(%r8,%rax,4), %r9 # INTEL: imulzu
r9, qword ptr [r8 + 4*rax + 291], 123 +0x62,0x54,0xfc,0x18,0x6b,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: imulzuw $1234, %dx, %dx +# INTEL: imulzu dx, dx, 1234 +0x62,0xf4,0x7d,0x18,0x69,0xd2,0xd2,0x04 + +# ATT: imulzuw $1234, 291(%r8,%rax,4), %dx +# INTEL: imulzu dx, word ptr [r8 + 4*rax + 291], 1234 +0x62,0xd4,0x7d,0x18,0x69,0x94,0x80,0x23,0x01,0x00,0x00,0xd2,0x04 + +# ATT: imulzul $123456, %ecx, %ecx +# INTEL: imulzu ecx, ecx, 123456 +0x62,0xf4,0x7c,0x18,0x69,0xc9,0x40,0xe2,0x01,0x00 + +# ATT: imulzuq $123456, %r9, %r9 +# INTEL: imulzu r9, r9, 123456 +0x62,0x54,0xfc,0x18,0x69,0xc9,0x40,0xe2,0x01,0x00 + +# ATT: imulzul $123456, 291(%r8,%rax,4), %ecx +# INTEL: imulzu ecx, dword ptr [r8 + 4*rax + 291], 123456 +0x62,0xd4,0x7c,0x18,0x69,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00 + +# ATT: imulzuq $123456, 291(%r8,%rax,4), %r9 +# INTEL: imulzu r9, qword ptr [r8 + 4*rax + 291], 123456 +0x62,0x54,0xfc,0x18,0x69,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00 diff --git a/llvm/test/MC/X86/apx/imulzu-att.s b/llvm/test/MC/X86/apx/imulzu-att.s new file mode 100644 index 0000000000000..f56bfa77e1ce2 --- /dev/null +++ b/llvm/test/MC/X86/apx/imulzu-att.s @@ -0,0 +1,41 @@ +# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s +# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR + +# ERROR-COUNT-12: error: +# ERROR-NOT: error: +# CHECK: imulzuw $123, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x6b,0xd2,0x7b] + imulzuw $123, %dx, %dx +# CHECK: imulzul $123, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x18,0x6b,0xc9,0x7b] + imulzul $123, %ecx, %ecx +# CHECK: imulzuq $123, %r9, %r9 +# CHECK: encoding: [0x62,0x54,0xfc,0x18,0x6b,0xc9,0x7b] + imulzuq $123, %r9, %r9 +# CHECK: imulzuw $123, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x6b,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + imulzuw $123, 291(%r8,%rax,4), %dx +# CHECK: imulzul $123, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x7c,0x18,0x6b,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + imulzul $123, 291(%r8,%rax,4), %ecx +# CHECK: imulzuq $123, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0x54,0xfc,0x18,0x6b,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + imulzuq $123, 291(%r8,%rax,4), %r9 +# CHECK: imulzuw $1234, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x69,0xd2,0xd2,0x04] + imulzuw $1234, %dx, %dx +# CHECK: imulzuw $1234, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x69,0x94,0x80,0x23,0x01,0x00,0x00,0xd2,0x04] + imulzuw $1234, 291(%r8,%rax,4), %dx +# CHECK: imulzul $123456, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x18,0x69,0xc9,0x40,0xe2,0x01,0x00] + imulzul $123456, %ecx, %ecx +# CHECK: imulzuq $123456, %r9, %r9 +# CHECK: encoding: [0x62,0x54,0xfc,0x18,0x69,0xc9,0x40,0xe2,0x01,0x00] + imulzuq $123456, %r9, %r9 +# CHECK: imulzul $123456, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x7c,0x18,0x69,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00] + imulzul $123456, 291(%r8,%rax,4), %ecx +# CHECK: imulzuq $123456, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0x54,0xfc,0x18,0x69,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00] + imulzuq $123456, 291(%r8,%rax,4), %r9 diff --git a/llvm/test/MC/X86/apx/imulzu-intel.s b/llvm/test/MC/X86/apx/imulzu-intel.s new file mode 100644 index 0000000000000..3a01fdca14895 --- /dev/null +++ b/llvm/test/MC/X86/apx/imulzu-intel.s @@ -0,0 +1,38 @@ +# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s + +# CHECK: imulzu dx, dx, 123 +# CHECK: encoding: 
[0x62,0xf4,0x7d,0x18,0x6b,0xd2,0x7b] + imulzu dx, dx, 123 +# CHECK: imulzu ecx, ecx, 123 +# CHECK: encoding: [0x62,0xf4,0x7c,0x18,0x6b,0xc9,0x7b] + imulzu ecx, ecx, 123 +# CHECK: imulzu r9, r9, 123 +# CHECK: encoding: [0x62,0x54,0xfc,0x18,0x6b,0xc9,0x7b] + imulzu r9, r9, 123 +# CHECK: imulzu dx, word ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x6b,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + imulzu dx, word ptr [r8 + 4*rax + 291], 123 +# CHECK: imulzu ecx, dword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x7c,0x18,0x6b,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + imulzu ecx, dword ptr [r8 + 4*rax + 291], 123 +# CHECK: imulzu r9, qword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0x54,0xfc,0x18,0x6b,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + imulzu r9, qword ptr [r8 + 4*rax + 291], 123 +# CHECK: imulzu dx, dx, 1234 +# CHECK: encoding: [0x62,0xf4,0x7d,0x18,0x69,0xd2,0xd2,0x04] + imulzu dx, dx, 1234 +# CHECK: imulzu dx, word ptr [r8 + 4*rax + 291], 1234 +# CHECK: encoding: [0x62,0xd4,0x7d,0x18,0x69,0x94,0x80,0x23,0x01,0x00,0x00,0xd2,0x04] + imulzu dx, word ptr [r8 + 4*rax + 291], 1234 +# CHECK: imulzu ecx, ecx, 123456 +# CHECK: encoding: [0x62,0xf4,0x7c,0x18,0x69,0xc9,0x40,0xe2,0x01,0x00] + imulzu ecx, ecx, 123456 +# CHECK: imulzu r9, r9, 123456 +# CHECK: encoding: [0x62,0x54,0xfc,0x18,0x69,0xc9,0x40,0xe2,0x01,0x00] + imulzu r9, r9, 123456 +# CHECK: imulzu ecx, dword ptr [r8 + 4*rax + 291], 123456 +# CHECK: encoding: [0x62,0xd4,0x7c,0x18,0x69,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00] + imulzu ecx, dword ptr [r8 + 4*rax + 291], 123456 +# CHECK: imulzu r9, qword ptr [r8 + 4*rax + 291], 123456 +# CHECK: encoding: [0x62,0x54,0xfc,0x18,0x69,0x8c,0x80,0x23,0x01,0x00,0x00,0x40,0xe2,0x01,0x00] + imulzu r9, qword ptr [r8 + 4*rax + 291], 123456 diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td index 15bd5cce52e8e..391c7ad15718f 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td @@ -114,14 +114,17 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: void GenMyCombiner::runCustomAction(unsigned ApplyID, const MatcherState &State, NewMIVector &OutMIs) const { // CHECK-NEXT: switch(ApplyID) { // CHECK-NEXT: case GICXXCustomAction_CombineApplyGICombiner0:{ +// CHECK-NEXT: Helper.getBuilder().setInstrAndDebugLoc(*State.MIs[0]); // CHECK-NEXT: APPLY // CHECK-NEXT: return; // CHECK-NEXT: } // CHECK-NEXT: case GICXXCustomAction_CombineApplyGICombiner1:{ +// CHECK-NEXT: Helper.getBuilder().setInstrAndDebugLoc(*State.MIs[0]); // CHECK-NEXT: APPLY MatchInfos.MDInfo0, MatchInfos.MDInfo1 // CHECK-NEXT: return; // CHECK-NEXT: } // CHECK-NEXT: case GICXXCustomAction_CombineApplyGICombiner2:{ +// CHECK-NEXT: Helper.getBuilder().setInstrAndDebugLoc(*State.MIs[0]); // CHECK-NEXT: APPLY State.MIs[1]->getOperand(1) State.MIs[0]->getOperand(1) OutMIs[0] // CHECK-NEXT: return; // CHECK-NEXT: } diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc index 7b65e483c39d0..4ab5567f62876 100644 --- a/llvm/test/TableGen/x86-fold-tables.inc +++ b/llvm/test/TableGen/x86-fold-tables.inc @@ -756,6 +756,12 @@ static const X86FoldTableEntry Table1[] = { {X86::IMUL64rri32_NF, X86::IMUL64rmi32_NF, 0}, {X86::IMUL64rri8, X86::IMUL64rmi8, 0}, {X86::IMUL64rri8_NF, X86::IMUL64rmi8_NF, 0}, + {X86::IMULZU16rri, X86::IMULZU16rmi, 0}, + {X86::IMULZU16rri8, X86::IMULZU16rmi8, 0}, + 
{X86::IMULZU32rri, X86::IMULZU32rmi, 0}, + {X86::IMULZU32rri8, X86::IMULZU32rmi8, 0}, + {X86::IMULZU64rri32, X86::IMULZU64rmi32, 0}, + {X86::IMULZU64rri8, X86::IMULZU64rmi8, 0}, {X86::INC16r_ND, X86::INC16m_ND, 0}, {X86::INC16r_NF_ND, X86::INC16m_NF_ND, 0}, {X86::INC32r_ND, X86::INC32m_ND, 0}, diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll index b6e6b26024952..a5d4c329446f9 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -3028,7 +3028,7 @@ define bfloat @test_atomicrmw_fadd_bf16_global_system_align4(ptr addrspace(1) %p define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bfloat %value) #2 { ; CI-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; CI-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6]] ; CI-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 ; CI-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 ; CI-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 @@ -3041,7 +3041,7 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf ; CI-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; CI-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; CI-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; CI-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] +; CI-NEXT: [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]] ; CI-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; CI-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 ; CI-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] @@ -3058,7 +3058,7 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf ; CI-NEXT: ret bfloat [[TMP7]] ; ; GFX9-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX9-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6]] ; GFX9-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 ; GFX9-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 ; GFX9-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 @@ -3071,7 +3071,7 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf ; GFX9-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX9-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX9-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX9-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] +; GFX9-NEXT: [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]] ; GFX9-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; GFX9-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 ; GFX9-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] @@ -3088,7 +3088,7 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf ; GFX9-NEXT: ret bfloat [[TMP7]] ; ; GFX908-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; GFX908-NEXT: 
[[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX908-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6]] ; GFX908-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 ; GFX908-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 ; GFX908-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 @@ -3101,7 +3101,7 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf ; GFX908-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX908-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX908-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX908-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] +; GFX908-NEXT: [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]] ; GFX908-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; GFX908-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 ; GFX908-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] @@ -3118,7 +3118,7 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf ; GFX908-NEXT: ret bfloat [[TMP7]] ; ; GFX90A-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX90A-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6:[0-9]+]] ; GFX90A-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 ; GFX90A-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 ; GFX90A-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 @@ -3131,7 +3131,7 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf ; GFX90A-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX90A-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX90A-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX90A-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] +; GFX90A-NEXT: [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]] ; GFX90A-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; GFX90A-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 ; GFX90A-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] @@ -3148,7 +3148,7 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf ; GFX90A-NEXT: ret bfloat [[TMP7]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX940-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6:[0-9]+]] ; GFX940-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 ; GFX940-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 ; GFX940-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 @@ -3161,7 +3161,7 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf ; GFX940-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX940-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX940-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX940-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] +; GFX940-NEXT: [[NEW:%.*]] = call bfloat 
@llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]] ; GFX940-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; GFX940-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 ; GFX940-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] @@ -3178,7 +3178,7 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf ; GFX940-NEXT: ret bfloat [[TMP7]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp( -; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) +; GFX11-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR6]] ; GFX11-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32 ; GFX11-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3 ; GFX11-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3 @@ -3191,7 +3191,7 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf ; GFX11-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]] ; GFX11-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; GFX11-NEXT: [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat -; GFX11-NEXT: [[NEW:%.*]] = fadd bfloat [[TMP4]], [[VALUE:%.*]] +; GFX11-NEXT: [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR6]] ; GFX11-NEXT: [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16 ; GFX11-NEXT: [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32 ; GFX11-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]] diff --git a/llvm/test/Transforms/FunctionAttrs/noundef.ll b/llvm/test/Transforms/FunctionAttrs/noundef.ll index 946b562f39553..9ab37082a3032 100644 --- a/llvm/test/Transforms/FunctionAttrs/noundef.ll +++ b/llvm/test/Transforms/FunctionAttrs/noundef.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt < %s -passes='function-attrs' -S | FileCheck %s +@g_var = external global [0 x i8] + define i32 @test_ret_constant() { ; CHECK-LABEL: define noundef i32 @test_ret_constant( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { @@ -152,3 +154,15 @@ define i32 @test_ret_constant_msan() sanitize_memory { ; ret i32 0 } + +define i64 @test_trunc_with_constexpr() { +; CHECK-LABEL: define noundef i64 @test_trunc_with_constexpr( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[ADD:%.*]] = add i32 trunc (i64 sub (i64 0, i64 ptrtoint (ptr @g_var to i64)) to i32), 1 +; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[ADD]] to i64 +; CHECK-NEXT: ret i64 [[CONV]] +; + %add = add i32 trunc (i64 sub (i64 0, i64 ptrtoint (ptr @g_var to i64)) to i32), 1 + %conv = sext i32 %add to i64 + ret i64 %conv +} diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll index 9b990480709c9..80d8e1b16ed28 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll @@ -39,10 +39,9 @@ define <4 x float> @test_add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 
[[TMP4]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], float [[TMP5]], float [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP4]], float [[TMP3]], float [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[A]], float [[TMP6]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP7]] ; @@ -117,10 +116,9 @@ define <2 x double> @test_add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fadd double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double [[TMP5]], double [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP4]], double [[TMP3]], double [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[A]], double [[TMP6]], i64 0 ; CHECK-NEXT: ret <2 x double> [[TMP7]] ; @@ -191,10 +189,9 @@ define <4 x float> @test_sub_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fsub float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], float [[TMP5]], float [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP4]], float [[TMP3]], float [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[A]], float [[TMP6]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP7]] ; @@ -269,10 +266,9 @@ define <2 x double> @test_sub_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fsub double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double [[TMP5]], double [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP4]], double [[TMP3]], double [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[A]], double [[TMP6]], i64 0 ; CHECK-NEXT: ret <2 x double> [[TMP7]] ; @@ -343,10 +339,9 @@ define <4 x float> @test_mul_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fmul float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 
[[DOTNOT]], float [[TMP5]], float [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP4]], float [[TMP3]], float [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[A]], float [[TMP6]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP7]] ; @@ -421,10 +416,9 @@ define <2 x double> @test_mul_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fmul double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double [[TMP5]], double [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP4]], double [[TMP3]], double [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[A]], double [[TMP6]], i64 0 ; CHECK-NEXT: ret <2 x double> [[TMP7]] ; @@ -495,10 +489,9 @@ define <4 x float> @test_div_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fdiv float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], float [[TMP5]], float [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP4]], float [[TMP3]], float [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[A]], float [[TMP6]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP7]] ; @@ -573,10 +566,9 @@ define <2 x double> @test_div_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fdiv double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double [[TMP5]], double [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP4]], double [[TMP3]], double [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[A]], double [[TMP6]], i64 0 ; CHECK-NEXT: ret <2 x double> [[TMP7]] ; @@ -981,9 +973,8 @@ define <4 x float> @test_mask_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x flo ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], float [[TMP1]], float [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float [[TMP1]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[A]], float [[TMP6]], i64 0 ; CHECK-NEXT: 
ret <4 x float> [[TMP7]] ; @@ -1011,9 +1002,8 @@ define float @test_mask_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], float [[TMP1]], float [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float [[TMP1]] ; CHECK-NEXT: ret float [[TMP6]] ; %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1 @@ -1060,9 +1050,8 @@ define <2 x double> @test_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double [[TMP1]], double [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP4]], double [[TMP1]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[A]], double [[TMP6]], i64 0 ; CHECK-NEXT: ret <2 x double> [[TMP7]] ; @@ -1086,9 +1075,8 @@ define double @test_mask_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x doub ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double [[TMP1]], double [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP4]], double [[TMP1]] ; CHECK-NEXT: ret double [[TMP6]] ; %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1 @@ -1129,9 +1117,8 @@ define <4 x float> @test_maskz_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x fl ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], float 0.000000e+00, float [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float 0.000000e+00 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[A]], float [[TMP6]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP7]] ; @@ -1159,9 +1146,8 @@ define float @test_maskz_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float 
[[TMP2]], float [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], float 0.000000e+00, float [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float 0.000000e+00 ; CHECK-NEXT: ret float [[TMP6]] ; %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1 @@ -1206,9 +1192,8 @@ define <2 x double> @test_maskz_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double 0.000000e+00, double [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP4]], double 0.000000e+00 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[A]], double [[TMP6]], i64 0 ; CHECK-NEXT: ret <2 x double> [[TMP7]] ; @@ -1232,9 +1217,8 @@ define double @test_maskz_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x dou ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double 0.000000e+00, double [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP4]], double 0.000000e+00 ; CHECK-NEXT: ret double [[TMP6]] ; %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1 @@ -1275,9 +1259,8 @@ define <4 x float> @test_mask3_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x fl ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], float [[TMP3]], float [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[C]], float [[TMP6]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP7]] ; @@ -1305,9 +1288,8 @@ define float @test_mask3_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], float [[TMP3]], float [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float 
[[TMP4]], float [[TMP3]] ; CHECK-NEXT: ret float [[TMP6]] ; %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 @@ -1352,9 +1334,8 @@ define <2 x double> @test_mask3_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double [[TMP3]], double [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP4]], double [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[C]], double [[TMP6]], i64 0 ; CHECK-NEXT: ret <2 x double> [[TMP7]] ; @@ -1378,9 +1359,8 @@ define double @test_mask3_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x dou ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double [[TMP3]], double [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP4]], double [[TMP3]] ; CHECK-NEXT: ret double [[TMP6]] ; %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1 @@ -1423,9 +1403,8 @@ define <4 x float> @test_mask3_vfmsub_ss(<4 x float> %a, <4 x float> %b, <4 x fl ; CHECK-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP4]]) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP7]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[DOTNOT]], float [[TMP6]], float [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[C]], float [[TMP8]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP9]] ; @@ -1457,9 +1436,8 @@ define float @test_mask3_vfmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP4]]) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP7]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[DOTNOT]], float [[TMP6]], float [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] ; CHECK-NEXT: ret float [[TMP8]] ; %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 @@ -1532,9 +1510,8 @@ define <2 x double> @test_mask3_vfmsub_sd(<2 x double> %a, <2 x double> %b, <2 x ; CHECK-NEXT: [[TMP4:%.*]] = fneg double [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP4]]) ; CHECK-NEXT: [[TMP6:%.*]] = 
extractelement <2 x double> [[C]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP7]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[DOTNOT]], double [[TMP6]], double [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], double [[TMP5]], double [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[C]], double [[TMP8]], i64 0 ; CHECK-NEXT: ret <2 x double> [[TMP9]] ; @@ -1562,9 +1539,8 @@ define double @test_mask3_vfmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x dou ; CHECK-NEXT: [[TMP4:%.*]] = fneg double [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP4]]) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[C]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP7]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[DOTNOT]], double [[TMP6]], double [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], double [[TMP5]], double [[TMP6]] ; CHECK-NEXT: ret double [[TMP8]] ; %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1 @@ -1632,9 +1608,8 @@ define <4 x float> @test_mask3_vfnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x f ; CHECK-NEXT: [[TMP5:%.*]] = fneg float [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.fma.f32(float [[TMP2]], float [[TMP3]], float [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[C]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[DOTNOT]], float [[TMP7]], float [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[C]], float [[TMP9]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP10]] ; @@ -1668,9 +1643,8 @@ define float @test_mask3_vfnmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float ; CHECK-NEXT: [[TMP5:%.*]] = fneg float [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.fma.f32(float [[TMP2]], float [[TMP3]], float [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[C]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[DOTNOT]], float [[TMP7]], float [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float [[TMP7]] ; CHECK-NEXT: ret float [[TMP9]] ; %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 @@ -1747,9 +1721,8 @@ define <2 x double> @test_mask3_vfnmsub_sd(<2 x double> %a, <2 x double> %b, <2 ; CHECK-NEXT: [[TMP5:%.*]] = fneg double [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP2]], double [[TMP3]], double [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[C]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[DOTNOT]], double [[TMP7]], double [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], double [[TMP6]], double [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[C]], double [[TMP9]], i64 0 ; 
CHECK-NEXT: ret <2 x double> [[TMP10]] ; @@ -1779,9 +1752,8 @@ define double @test_mask3_vfnmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x do ; CHECK-NEXT: [[TMP5:%.*]] = fneg double [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP2]], double [[TMP3]], double [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[C]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[DOTNOT]], double [[TMP7]], double [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], double [[TMP6]], double [[TMP7]] ; CHECK-NEXT: ret double [[TMP9]] ; %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1 diff --git a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll index c10c922f66432..906e84b607481 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-avx512.ll @@ -39,10 +39,9 @@ define <4 x float> @test_add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], float [[TMP5]], float [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP4]], float [[TMP3]], float [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[A]], float [[TMP6]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP7]] ; @@ -117,10 +116,9 @@ define <2 x double> @test_add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fadd double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double [[TMP5]], double [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP4]], double [[TMP3]], double [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[A]], double [[TMP6]], i64 0 ; CHECK-NEXT: ret <2 x double> [[TMP7]] ; @@ -191,10 +189,9 @@ define <4 x float> @test_sub_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fsub float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], float [[TMP5]], float [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP4]], float [[TMP3]], float [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[A]], float [[TMP6]], i64 0 ; 
CHECK-NEXT: ret <4 x float> [[TMP7]] ; @@ -269,10 +266,9 @@ define <2 x double> @test_sub_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fsub double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double [[TMP5]], double [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP4]], double [[TMP3]], double [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[A]], double [[TMP6]], i64 0 ; CHECK-NEXT: ret <2 x double> [[TMP7]] ; @@ -343,10 +339,9 @@ define <4 x float> @test_mul_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fmul float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], float [[TMP5]], float [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP4]], float [[TMP3]], float [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[A]], float [[TMP6]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP7]] ; @@ -421,10 +416,9 @@ define <2 x double> @test_mul_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fmul double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double [[TMP5]], double [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP4]], double [[TMP3]], double [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[A]], double [[TMP6]], i64 0 ; CHECK-NEXT: ret <2 x double> [[TMP7]] ; @@ -495,10 +489,9 @@ define <4 x float> @test_div_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fdiv float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], float [[TMP5]], float [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP4]], float [[TMP3]], float [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[A]], float [[TMP6]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP7]] ; @@ -573,10 +566,9 @@ define <2 x double> @test_div_sd_mask(<2 x double> %a, <2 x double> %b, <2 x dou ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x 
double> [[A:%.*]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = fdiv double [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP4]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double [[TMP5]], double [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP4]], double [[TMP3]], double [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[A]], double [[TMP6]], i64 0 ; CHECK-NEXT: ret <2 x double> [[TMP7]] ; @@ -981,9 +973,8 @@ define <4 x float> @test_mask_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x flo ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], float [[TMP1]], float [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float [[TMP1]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[A]], float [[TMP6]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP7]] ; @@ -1011,9 +1002,8 @@ define float @test_mask_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], float [[TMP1]], float [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float [[TMP1]] ; CHECK-NEXT: ret float [[TMP6]] ; %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1 @@ -1060,9 +1050,8 @@ define <2 x double> @test_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double [[TMP1]], double [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP4]], double [[TMP1]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[A]], double [[TMP6]], i64 0 ; CHECK-NEXT: ret <2 x double> [[TMP7]] ; @@ -1086,9 +1075,8 @@ define double @test_mask_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x doub ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 
[[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double [[TMP1]], double [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP4]], double [[TMP1]] ; CHECK-NEXT: ret double [[TMP6]] ; %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1 @@ -1129,9 +1117,8 @@ define <4 x float> @test_maskz_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x fl ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], float 0.000000e+00, float [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float 0.000000e+00 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[A]], float [[TMP6]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP7]] ; @@ -1159,9 +1146,8 @@ define float @test_maskz_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], float 0.000000e+00, float [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float 0.000000e+00 ; CHECK-NEXT: ret float [[TMP6]] ; %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1 @@ -1206,9 +1192,8 @@ define <2 x double> @test_maskz_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double 0.000000e+00, double [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP4]], double 0.000000e+00 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[A]], double [[TMP6]], i64 0 ; CHECK-NEXT: ret <2 x double> [[TMP7]] ; @@ -1232,9 +1217,8 @@ define double @test_maskz_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x dou ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double 0.000000e+00, double [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP4]], double 0.000000e+00 ; CHECK-NEXT: ret double [[TMP6]] ; %1 = insertelement <2 x double> %a, double 1.000000e+00, 
i32 1 @@ -1275,9 +1259,8 @@ define <4 x float> @test_mask3_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x fl ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], float [[TMP3]], float [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[C]], float [[TMP6]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP7]] ; @@ -1305,9 +1288,8 @@ define float @test_mask3_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], float [[TMP3]], float [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float [[TMP3]] ; CHECK-NEXT: ret float [[TMP6]] ; %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 @@ -1352,9 +1334,8 @@ define <2 x double> @test_mask3_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double [[TMP3]], double [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP4]], double [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[C]], double [[TMP6]], i64 0 ; CHECK-NEXT: ret <2 x double> [[TMP7]] ; @@ -1378,9 +1359,8 @@ define double @test_mask3_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x dou ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[DOTNOT]], double [[TMP3]], double [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP4]], double [[TMP3]] ; CHECK-NEXT: ret double [[TMP6]] ; %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1 @@ -1423,9 +1403,8 @@ define <4 x float> @test_mask3_vfmsub_ss(<4 x float> %a, <4 x float> %b, <4 x fl ; CHECK-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP4]]) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = and 
i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP7]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[DOTNOT]], float [[TMP6]], float [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[C]], float [[TMP8]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP9]] ; @@ -1457,9 +1436,8 @@ define float @test_mask3_vfmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> ; CHECK-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP4]]) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[C]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP7]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[DOTNOT]], float [[TMP6]], float [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]] ; CHECK-NEXT: ret float [[TMP8]] ; %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 @@ -1532,9 +1510,8 @@ define <2 x double> @test_mask3_vfmsub_sd(<2 x double> %a, <2 x double> %b, <2 x ; CHECK-NEXT: [[TMP4:%.*]] = fneg double [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP4]]) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[C]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP7]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[DOTNOT]], double [[TMP6]], double [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], double [[TMP5]], double [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[C]], double [[TMP8]], i64 0 ; CHECK-NEXT: ret <2 x double> [[TMP9]] ; @@ -1562,9 +1539,8 @@ define double @test_mask3_vfmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x dou ; CHECK-NEXT: [[TMP4:%.*]] = fneg double [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP4]]) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[C]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP7]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[DOTNOT]], double [[TMP6]], double [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], double [[TMP5]], double [[TMP6]] ; CHECK-NEXT: ret double [[TMP8]] ; %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1 @@ -1632,9 +1608,8 @@ define <4 x float> @test_mask3_vfnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x f ; CHECK-NEXT: [[TMP5:%.*]] = fneg float [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.fma.f32(float [[TMP2]], float [[TMP3]], float [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[C]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[DOTNOT]], float [[TMP7]], float [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[C]], float [[TMP9]], i64 0 ; CHECK-NEXT: ret <4 x float> [[TMP10]] ; @@ -1668,9 +1643,8 @@ define float 
@test_mask3_vfnmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float ; CHECK-NEXT: [[TMP5:%.*]] = fneg float [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.fma.f32(float [[TMP2]], float [[TMP3]], float [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[C]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[DOTNOT]], float [[TMP7]], float [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float [[TMP7]] ; CHECK-NEXT: ret float [[TMP9]] ; %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1 @@ -1747,9 +1721,8 @@ define <2 x double> @test_mask3_vfnmsub_sd(<2 x double> %a, <2 x double> %b, <2 ; CHECK-NEXT: [[TMP5:%.*]] = fneg double [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP2]], double [[TMP3]], double [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[C]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[DOTNOT]], double [[TMP7]], double [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], double [[TMP6]], double [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[C]], double [[TMP9]], i64 0 ; CHECK-NEXT: ret <2 x double> [[TMP10]] ; @@ -1779,9 +1752,8 @@ define double @test_mask3_vfnmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x do ; CHECK-NEXT: [[TMP5:%.*]] = fneg double [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP2]], double [[TMP3]], double [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[C]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = and i8 [[MASK:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[DOTNOT]], double [[TMP7]], double [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = trunc i8 [[MASK:%.*]] to i1 +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], double [[TMP6]], double [[TMP7]] ; CHECK-NEXT: ret double [[TMP9]] ; %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1 diff --git a/llvm/test/Transforms/InstCombine/apint-shl-trunc.ll b/llvm/test/Transforms/InstCombine/apint-shl-trunc.ll index 2d72a4ff8c0df..e2346987737a2 100644 --- a/llvm/test/Transforms/InstCombine/apint-shl-trunc.ll +++ b/llvm/test/Transforms/InstCombine/apint-shl-trunc.ll @@ -3,9 +3,8 @@ define i1 @test0(i39 %X, i39 %A) { ; CHECK-LABEL: @test0( -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i39 1, [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i39 [[TMP1]], [[X:%.*]] -; CHECK-NEXT: [[D:%.*]] = icmp ne i39 [[TMP2]], 0 +; CHECK-NEXT: [[B:%.*]] = lshr i39 [[X:%.*]], [[A:%.*]] +; CHECK-NEXT: [[D:%.*]] = trunc i39 [[B]] to i1 ; CHECK-NEXT: ret i1 [[D]] ; %B = lshr i39 %X, %A @@ -15,9 +14,8 @@ define i1 @test0(i39 %X, i39 %A) { define i1 @test1(i799 %X, i799 %A) { ; CHECK-LABEL: @test1( -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i799 1, [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and i799 [[TMP1]], [[X:%.*]] -; CHECK-NEXT: [[D:%.*]] = icmp ne i799 [[TMP2]], 0 +; CHECK-NEXT: [[B:%.*]] = lshr i799 [[X:%.*]], [[A:%.*]] +; CHECK-NEXT: [[D:%.*]] = trunc i799 [[B]] to i1 ; CHECK-NEXT: ret i1 [[D]] ; %B = lshr i799 %X, %A diff --git a/llvm/test/Transforms/InstCombine/cast.ll b/llvm/test/Transforms/InstCombine/cast.ll index 85433a99f2cae..97554e9462043 100644 --- 
a/llvm/test/Transforms/InstCombine/cast.ll +++ b/llvm/test/Transforms/InstCombine/cast.ll @@ -1399,8 +1399,7 @@ define float @sitofp_zext(i16 %a) { define i1 @PR23309(i32 %A, i32 %B) { ; ALL-LABEL: @PR23309( ; ALL-NEXT: [[SUB:%.*]] = sub i32 [[A:%.*]], [[B:%.*]] -; ALL-NEXT: [[TMP1:%.*]] = and i32 [[SUB]], 1 -; ALL-NEXT: [[TRUNC:%.*]] = icmp ne i32 [[TMP1]], 0 +; ALL-NEXT: [[TRUNC:%.*]] = trunc i32 [[SUB]] to i1 ; ALL-NEXT: ret i1 [[TRUNC]] ; %add = add i32 %A, -4 @@ -1412,8 +1411,7 @@ define i1 @PR23309(i32 %A, i32 %B) { define i1 @PR23309v2(i32 %A, i32 %B) { ; ALL-LABEL: @PR23309v2( ; ALL-NEXT: [[SUB:%.*]] = add i32 [[A:%.*]], [[B:%.*]] -; ALL-NEXT: [[TMP1:%.*]] = and i32 [[SUB]], 1 -; ALL-NEXT: [[TRUNC:%.*]] = icmp ne i32 [[TMP1]], 0 +; ALL-NEXT: [[TRUNC:%.*]] = trunc i32 [[SUB]] to i1 ; ALL-NEXT: ret i1 [[TRUNC]] ; %add = add i32 %A, -4 diff --git a/llvm/test/Transforms/InstCombine/catchswitch-phi.ll b/llvm/test/Transforms/InstCombine/catchswitch-phi.ll index 038847609b0f9..cb87ee67a4518 100644 --- a/llvm/test/Transforms/InstCombine/catchswitch-phi.ll +++ b/llvm/test/Transforms/InstCombine/catchswitch-phi.ll @@ -24,11 +24,11 @@ define void @test0(i1 %c1) personality ptr @__gxx_wasm_personality_v0 { ; CHECK: bb1: ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4 ; CHECK-NEXT: invoke void @foo() -; CHECK-NEXT: to label [[BB3:%.*]] unwind label [[BB4:%.*]] +; CHECK-NEXT: to label [[BB3:%.*]] unwind label [[BB4:%.*]] ; CHECK: bb2: ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4 ; CHECK-NEXT: invoke void @foo() -; CHECK-NEXT: to label [[BB3]] unwind label [[BB4]] +; CHECK-NEXT: to label [[BB3]] unwind label [[BB4]] ; CHECK: bb3: ; CHECK-NEXT: unreachable ; CHECK: bb4: @@ -37,7 +37,7 @@ define void @test0(i1 %c1) personality ptr @__gxx_wasm_personality_v0 { ; CHECK: bb5: ; CHECK-NEXT: [[TMP5:%.*]] = catchpad within [[TMP4]] [ptr null] ; CHECK-NEXT: invoke void @foo() [ "funclet"(token [[TMP5]]) ] -; CHECK-NEXT: to label [[BB6:%.*]] unwind label [[BB7]] +; CHECK-NEXT: to label [[BB6:%.*]] unwind label [[BB7]] ; CHECK: bb6: ; CHECK-NEXT: unreachable ; CHECK: bb7: @@ -89,10 +89,10 @@ define void @test1() personality ptr @__gxx_wasm_personality_v0 { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: ; CHECK-NEXT: invoke void @foo() -; CHECK-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[CATCH_DISPATCH1:%.*]] +; CHECK-NEXT: to label [[INVOKE_CONT:%.*]] unwind label [[CATCH_DISPATCH1:%.*]] ; CHECK: invoke.cont: ; CHECK-NEXT: [[CALL:%.*]] = invoke i32 @baz() -; CHECK-NEXT: to label [[INVOKE_CONT1:%.*]] unwind label [[CATCH_DISPATCH:%.*]] +; CHECK-NEXT: to label [[INVOKE_CONT1:%.*]] unwind label [[CATCH_DISPATCH:%.*]] ; CHECK: invoke.cont1: ; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] @@ -101,7 +101,7 @@ define void @test1() personality ptr @__gxx_wasm_personality_v0 { ; CHECK: if.end: ; CHECK-NEXT: [[AP_0:%.*]] = phi i8 [ 1, [[IF_THEN]] ], [ 0, [[INVOKE_CONT1]] ] ; CHECK-NEXT: invoke void @foo() -; CHECK-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[CATCH_DISPATCH]] +; CHECK-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[CATCH_DISPATCH]] ; CHECK: invoke.cont2: ; CHECK-NEXT: br label [[TRY_CONT:%.*]] ; CHECK: catch.dispatch: @@ -114,17 +114,16 @@ define void @test1() personality ptr @__gxx_wasm_personality_v0 { ; CHECK-NEXT: catchret from [[TMP1]] to label [[TRY_CONT]] ; CHECK: rethrow: ; CHECK-NEXT: invoke void @llvm.wasm.rethrow() 
#[[ATTR0:[0-9]+]] [ "funclet"(token [[TMP1]]) ] -; CHECK-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CATCH_DISPATCH1]] +; CHECK-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CATCH_DISPATCH1]] ; CHECK: catch.dispatch1: ; CHECK-NEXT: [[AP_2:%.*]] = phi i8 [ [[AP_1]], [[CATCH_DISPATCH]] ], [ [[AP_1]], [[RETHROW]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP2:%.*]] = catchswitch within none [label %catch.start1] unwind to caller ; CHECK: catch.start1: ; CHECK-NEXT: [[TMP3:%.*]] = catchpad within [[TMP2]] [ptr null] -; CHECK-NEXT: [[TMP0:%.*]] = and i8 [[AP_2]], 1 -; CHECK-NEXT: [[TOBOOL1_NOT:%.*]] = icmp eq i8 [[TMP0]], 0 +; CHECK-NEXT: [[TOBOOL1_NOT:%.*]] = trunc i8 [[AP_2]] to i1 ; CHECK-NEXT: br i1 [[TOBOOL1_NOT]], label [[IF_END1:%.*]], label [[IF_THEN1:%.*]] ; CHECK: if.then1: -; CHECK-NEXT: br label [[IF_END1]] +; CHECK-NEXT: br label [[IF_THEN1]] ; CHECK: if.end1: ; CHECK-NEXT: catchret from [[TMP3]] to label [[TRY_CONT]] ; CHECK: try.cont: diff --git a/llvm/test/Transforms/InstCombine/freeze.ll b/llvm/test/Transforms/InstCombine/freeze.ll index da59101d5710c..e8105b6287d0c 100644 --- a/llvm/test/Transforms/InstCombine/freeze.ll +++ b/llvm/test/Transforms/InstCombine/freeze.ll @@ -1049,7 +1049,7 @@ exit: define ptr @freeze_load_noundef(ptr %ptr) { ; CHECK-LABEL: @freeze_load_noundef( -; CHECK-NEXT: [[P:%.*]] = load ptr, ptr [[PTR:%.*]], align 8, !noundef !0 +; CHECK-NEXT: [[P:%.*]] = load ptr, ptr [[PTR:%.*]], align 8, !noundef [[META0:![0-9]+]] ; CHECK-NEXT: ret ptr [[P]] ; %p = load ptr, ptr %ptr, !noundef !0 @@ -1059,7 +1059,7 @@ define ptr @freeze_load_noundef(ptr %ptr) { define ptr @freeze_load_dereferenceable(ptr %ptr) { ; CHECK-LABEL: @freeze_load_dereferenceable( -; CHECK-NEXT: [[P:%.*]] = load ptr, ptr [[PTR:%.*]], align 8, !dereferenceable !1 +; CHECK-NEXT: [[P:%.*]] = load ptr, ptr [[PTR:%.*]], align 8, !dereferenceable [[META1:![0-9]+]] ; CHECK-NEXT: ret ptr [[P]] ; %p = load ptr, ptr %ptr, !dereferenceable !1 @@ -1138,6 +1138,17 @@ define i32 @propagate_drop_flags_or(i32 %arg) { ret i32 %v1.fr } +define i32 @propagate_drop_flags_trunc(i64 %arg) { +; CHECK-LABEL: @propagate_drop_flags_trunc( +; CHECK-NEXT: [[ARG_FR:%.*]] = freeze i64 [[ARG:%.*]] +; CHECK-NEXT: [[V1:%.*]] = trunc i64 [[ARG_FR]] to i32 +; CHECK-NEXT: ret i32 [[V1]] +; + %v1 = trunc nsw nuw i64 %arg to i32 + %v1.fr = freeze i32 %v1 + ret i32 %v1.fr +} + !0 = !{} !1 = !{i64 4} !2 = !{i32 0, i32 100} @@ -1145,8 +1156,8 @@ define i32 @propagate_drop_flags_or(i32 %arg) { ; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR1]] = { nounwind } ;. -; CHECK: [[META0:![0-9]+]] = !{} -; CHECK: [[META1:![0-9]+]] = !{i64 4} +; CHECK: [[META0]] = !{} +; CHECK: [[META1]] = !{i64 4} ; CHECK: [[RNG2]] = !{i32 0, i32 100} ; CHECK: [[RNG3]] = !{i32 0, i32 33} ;. 
diff --git a/llvm/test/Transforms/InstCombine/icmp-mul-and.ll b/llvm/test/Transforms/InstCombine/icmp-mul-and.ll index d5f5641392c0c..7e7f087ca7112 100644 --- a/llvm/test/Transforms/InstCombine/icmp-mul-and.ll +++ b/llvm/test/Transforms/InstCombine/icmp-mul-and.ll @@ -267,10 +267,10 @@ define i1 @pr51551_neg1(i32 %x, i32 %y) { define i1 @pr51551_neg2(i32 %x, i32 %y) { ; CHECK-LABEL: @pr51551_neg2( -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[Y:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[Y:%.*]] to i1 ; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[X:%.*]], 7 ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP2]], 0 +; CHECK-NEXT: [[DOTNOT:%.*]] = xor i1 [[TMP1]], true ; CHECK-NEXT: [[CMP:%.*]] = select i1 [[DOTNOT]], i1 true, i1 [[CMP1]] ; CHECK-NEXT: ret i1 [[CMP]] ; diff --git a/llvm/test/Transforms/InstCombine/icmp-mul-zext.ll b/llvm/test/Transforms/InstCombine/icmp-mul-zext.ll index adf78723b1302..d858c91becb57 100644 --- a/llvm/test/Transforms/InstCombine/icmp-mul-zext.ll +++ b/llvm/test/Transforms/InstCombine/icmp-mul-zext.ll @@ -128,12 +128,12 @@ define i1 @PR46561(i1 %a, i1 %x, i1 %y, i8 %z) { ; CHECK-NEXT: br i1 [[A:%.*]], label [[COND_TRUE:%.*]], label [[END:%.*]] ; CHECK: cond.true: ; CHECK-NEXT: [[MULBOOL:%.*]] = and i1 [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = and i8 [[Z:%.*]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i8 [[Z:%.*]] to i1 ; CHECK-NEXT: [[TMP2:%.*]] = xor i1 [[MULBOOL]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = xor i1 [[TMP2]], true ; CHECK-NEXT: br label [[END]] ; CHECK: end: -; CHECK-NEXT: [[P:%.*]] = phi i1 [ [[TMP2]], [[COND_TRUE]] ], [ false, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[P:%.*]] = phi i1 [ [[TMP3]], [[COND_TRUE]] ], [ false, [[ENTRY:%.*]] ] ; CHECK-NEXT: ret i1 [[P]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/mul-masked-bits.ll b/llvm/test/Transforms/InstCombine/mul-masked-bits.ll index da7cc2db09781..e940ae3fec163 100644 --- a/llvm/test/Transforms/InstCombine/mul-masked-bits.ll +++ b/llvm/test/Transforms/InstCombine/mul-masked-bits.ll @@ -214,9 +214,8 @@ define i64 @scalar_mul_bit_x0_y0_uses(i64 %x, i64 %y) { define i64 @scalar_mul_bit_x0_y1(i64 %x, i64 %y) { ; CHECK-LABEL: @scalar_mul_bit_x0_y1( ; CHECK-NEXT: [[AND2:%.*]] = and i64 [[Y:%.*]], 2 -; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[X:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[TMP1]], 0 -; CHECK-NEXT: [[MUL:%.*]] = select i1 [[DOTNOT]], i64 0, i64 [[AND2]] +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[X:%.*]] to i1 +; CHECK-NEXT: [[MUL:%.*]] = select i1 [[TMP1]], i64 [[AND2]], i64 0 ; CHECK-NEXT: ret i64 [[MUL]] ; %and1 = and i64 %x, 1 @@ -228,9 +227,8 @@ define i64 @scalar_mul_bit_x0_y1(i64 %x, i64 %y) { define i64 @scalar_mul_bit_x0_yC(i64 %x, i64 %y, i64 %c) { ; CHECK-LABEL: @scalar_mul_bit_x0_yC( ; CHECK-NEXT: [[AND2:%.*]] = and i64 [[Y:%.*]], [[C:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[X:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[TMP1]], 0 -; CHECK-NEXT: [[MUL:%.*]] = select i1 [[DOTNOT]], i64 0, i64 [[AND2]] +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[X:%.*]] to i1 +; CHECK-NEXT: [[MUL:%.*]] = select i1 [[TMP1]], i64 [[AND2]], i64 0 ; CHECK-NEXT: ret i64 [[MUL]] ; %and1 = and i64 %x, 1 diff --git a/llvm/test/Transforms/InstCombine/mul.ll b/llvm/test/Transforms/InstCombine/mul.ll index a176d16f2cdfb..d4a689c60786e 100644 --- a/llvm/test/Transforms/InstCombine/mul.ll +++ b/llvm/test/Transforms/InstCombine/mul.ll @@ -684,9 +684,8 @@ define <2 x i32> 
@signbit_mul_vec_commute(<2 x i32> %a, <2 x i32> %b) { define i32 @lowbit_mul(i32 %a, i32 %b) { ; CHECK-LABEL: @lowbit_mul( -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP1]], 0 -; CHECK-NEXT: [[E:%.*]] = select i1 [[DOTNOT]], i32 0, i32 [[B:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[A:%.*]] to i1 +; CHECK-NEXT: [[E:%.*]] = select i1 [[TMP1]], i32 [[B:%.*]], i32 0 ; CHECK-NEXT: ret i32 [[E]] ; %d = and i32 %a, 1 diff --git a/llvm/test/Transforms/InstCombine/phi.ll b/llvm/test/Transforms/InstCombine/phi.ll index e1ae6c1ea4757..7eb508ebb5537 100644 --- a/llvm/test/Transforms/InstCombine/phi.ll +++ b/llvm/test/Transforms/InstCombine/phi.ll @@ -116,8 +116,8 @@ define i32 @test6(i16 %A, i1 %b) { ; CHECK: BB1: ; CHECK-NEXT: br label [[BB2]] ; CHECK: BB2: -; CHECK-NEXT: [[B:%.*]] = zext i16 [[A:%.*]] to i32 -; CHECK-NEXT: ret i32 [[B]] +; CHECK-NEXT: [[C:%.*]] = zext i16 [[A:%.*]] to i32 +; CHECK-NEXT: ret i32 [[C]] ; BB0: %X = zext i16 %A to i32 @@ -129,8 +129,8 @@ BB1: BB2: ;; Suck casts into phi - %B = phi i32 [ %X, %BB0 ], [ %Y, %BB1 ] - ret i32 %B + %c = phi i32 [ %X, %BB0 ], [ %Y, %BB1 ] + ret i32 %c } define i32 @test_dead_cycle(i32 %A, i1 %cond) { @@ -232,8 +232,8 @@ define ptr @test8(ptr %A, i1 %b) { ; CHECK: BB1: ; CHECK-NEXT: br label [[BB2]] ; CHECK: BB2: -; CHECK-NEXT: [[B:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 -; CHECK-NEXT: ret ptr [[B]] +; CHECK-NEXT: [[C:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 +; CHECK-NEXT: ret ptr [[C]] ; BB0: %X = getelementptr inbounds { i32, i32 }, ptr %A, i32 0, i32 1 @@ -245,8 +245,8 @@ BB1: BB2: ;; Suck GEPs into phi - %B = phi ptr [ %X, %BB0 ], [ %Y, %BB1 ] - ret ptr %B + %c = phi ptr [ %X, %BB0 ], [ %Y, %BB1 ] + ret ptr %c } define i32 @test9(ptr %A, ptr %B) { @@ -489,9 +489,8 @@ define i64 @test15b(i64 %A, i1 %b) { ; CHECK-NEXT: [[Y_OFF0:%.*]] = phi i64 [ [[A]], [[ENTRY]] ], [ [[C]], [[ONE]] ] ; CHECK-NEXT: [[Y_OFF64]] = phi i64 [ [[A]], [[ENTRY]] ], [ 0, [[ONE]] ] ; CHECK-NEXT: [[D:%.*]] = call i64 @test15a(i64 [[Y_OFF64]]) -; CHECK-NEXT: [[TMP0:%.*]] = and i64 [[D]], 1 -; CHECK-NEXT: [[D1_NOT:%.*]] = icmp eq i64 [[TMP0]], 0 -; CHECK-NEXT: br i1 [[D1_NOT]], label [[END:%.*]], label [[ONE]] +; CHECK-NEXT: [[D1:%.*]] = trunc i64 [[D]] to i1 +; CHECK-NEXT: br i1 [[D1]], label [[ONE]], label [[END:%.*]] ; CHECK: end: ; CHECK-NEXT: ret i64 [[Y_OFF0]] ; diff --git a/llvm/test/Transforms/InstCombine/ptr-int-cast.ll b/llvm/test/Transforms/InstCombine/ptr-int-cast.ll index 6f5814e1a2823..69b8f6953d61e 100644 --- a/llvm/test/Transforms/InstCombine/ptr-int-cast.ll +++ b/llvm/test/Transforms/InstCombine/ptr-int-cast.ll @@ -6,8 +6,7 @@ define i1 @test1(ptr %x) nounwind { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[X:%.*]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP0]] to i1 ; CHECK-NEXT: ret i1 [[TMP2]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/reduction-add-sext-zext-i1.ll b/llvm/test/Transforms/InstCombine/reduction-add-sext-zext-i1.ll index bbb8d848be6f4..ad55b506a108b 100644 --- a/llvm/test/Transforms/InstCombine/reduction-add-sext-zext-i1.ll +++ b/llvm/test/Transforms/InstCombine/reduction-add-sext-zext-i1.ll @@ -5,8 +5,7 @@ define i1 @reduce_add_self(<8 x i1> %x) { ; CHECK-LABEL: @reduce_add_self( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[X:%.*]] to i8 ; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.ctpop.i8(i8 
[[TMP1]]), !range [[RNG0:![0-9]+]] -; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[TMP2]], 1 -; CHECK-NEXT: [[RES:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[RES:%.*]] = trunc i8 [[TMP2]] to i1 ; CHECK-NEXT: ret i1 [[RES]] ; %res = call i1 @llvm.vector.reduce.add.v8i32(<8 x i1> %x) diff --git a/llvm/test/Transforms/InstCombine/reduction-xor-sext-zext-i1.ll b/llvm/test/Transforms/InstCombine/reduction-xor-sext-zext-i1.ll index 97b6f7b6d96cd..84ac9369b5ff0 100644 --- a/llvm/test/Transforms/InstCombine/reduction-xor-sext-zext-i1.ll +++ b/llvm/test/Transforms/InstCombine/reduction-xor-sext-zext-i1.ll @@ -5,8 +5,7 @@ define i1 @reduce_xor_self(<8 x i1> %x) { ; CHECK-LABEL: @reduce_xor_self( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[X:%.*]] to i8 ; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.ctpop.i8(i8 [[TMP1]]), !range [[RNG0:![0-9]+]] -; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[TMP2]], 1 -; CHECK-NEXT: [[RES:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[RES:%.*]] = trunc i8 [[TMP2]] to i1 ; CHECK-NEXT: ret i1 [[RES]] ; %res = call i1 @llvm.vector.reduce.xor.v8i32(<8 x i1> %x) @@ -17,9 +16,8 @@ define i32 @reduce_xor_sext(<4 x i1> %x) { ; CHECK-LABEL: @reduce_xor_sext( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[X:%.*]] to i4 ; CHECK-NEXT: [[TMP2:%.*]] = call i4 @llvm.ctpop.i4(i4 [[TMP1]]), !range [[RNG1:![0-9]+]] -; CHECK-NEXT: [[TMP3:%.*]] = and i4 [[TMP2]], 1 -; CHECK-NEXT: [[SEXT:%.*]] = sub nsw i4 0, [[TMP3]] -; CHECK-NEXT: [[RES:%.*]] = sext i4 [[SEXT]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i4 [[TMP2]] to i1 +; CHECK-NEXT: [[RES:%.*]] = sext i1 [[TMP3]] to i32 ; CHECK-NEXT: ret i32 [[RES]] ; %sext = sext <4 x i1> %x to <4 x i32> @@ -57,9 +55,8 @@ define i8 @reduce_xor_zext_long(<128 x i1> %x) { ; CHECK-LABEL: @reduce_xor_zext_long( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <128 x i1> [[X:%.*]] to i128 ; CHECK-NEXT: [[TMP2:%.*]] = call i128 @llvm.ctpop.i128(i128 [[TMP1]]), !range [[RNG3:![0-9]+]] -; CHECK-NEXT: [[TMP3:%.*]] = trunc i128 [[TMP2]] to i8 -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP3]], 1 -; CHECK-NEXT: [[RES:%.*]] = sub nsw i8 0, [[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc i128 [[TMP2]] to i1 +; CHECK-NEXT: [[RES:%.*]] = sext i1 [[TMP3]] to i8 ; CHECK-NEXT: ret i8 [[RES]] ; %sext = sext <128 x i1> %x to <128 x i8> @@ -72,9 +69,8 @@ define i8 @reduce_xor_zext_long_external_use(<128 x i1> %x) { ; CHECK-LABEL: @reduce_xor_zext_long_external_use( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <128 x i1> [[X:%.*]] to i128 ; CHECK-NEXT: [[TMP2:%.*]] = call i128 @llvm.ctpop.i128(i128 [[TMP1]]), !range [[RNG3]] -; CHECK-NEXT: [[TMP3:%.*]] = trunc i128 [[TMP2]] to i8 -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP3]], 1 -; CHECK-NEXT: [[RES:%.*]] = sub nsw i8 0, [[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc i128 [[TMP2]] to i1 +; CHECK-NEXT: [[RES:%.*]] = sext i1 [[TMP3]] to i8 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <128 x i1> [[X]], i64 0 ; CHECK-NEXT: [[EXT:%.*]] = sext i1 [[TMP5]] to i8 ; CHECK-NEXT: store i8 [[EXT]], ptr @glob, align 1 diff --git a/llvm/test/Transforms/InstCombine/trunc.ll b/llvm/test/Transforms/InstCombine/trunc.ll index c6bc06d666d0a..760825d6b1da0 100644 --- a/llvm/test/Transforms/InstCombine/trunc.ll +++ b/llvm/test/Transforms/InstCombine/trunc.ll @@ -1021,3 +1021,40 @@ define i16 @PR44545(i32 %t0, i32 %data) { %sub = add nsw i16 %cast, -1 ret i16 %sub } + +; Make sure that SimplifyDemandedBits drops the nowrap flags +define i8 @drop_nsw_trunc(i16 %x, i16 %y) { +; CHECK-LABEL: @drop_nsw_trunc( +; CHECK-NEXT: [[AND2:%.*]] = and i16 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: 
[[RES:%.*]] = trunc i16 [[AND2]] to i8 +; CHECK-NEXT: ret i8 [[RES]] +; + %and = and i16 %x, 255 + %and2 = and i16 %and, %y + %res = trunc nsw i16 %and2 to i8 + ret i8 %res +} + +define i8 @drop_nuw_trunc(i16 %x, i16 %y) { +; CHECK-LABEL: @drop_nuw_trunc( +; CHECK-NEXT: [[AND2:%.*]] = and i16 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[B:%.*]] = trunc i16 [[AND2]] to i8 +; CHECK-NEXT: ret i8 [[B]] +; + %and = and i16 %x, 255 + %and2 = and i16 %and, %y + %res = trunc nuw i16 %and2 to i8 + ret i8 %res +} + +define i8 @drop_both_trunc(i16 %x, i16 %y) { +; CHECK-LABEL: @drop_both_trunc( +; CHECK-NEXT: [[AND2:%.*]] = and i16 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[RES:%.*]] = trunc i16 [[AND2]] to i8 +; CHECK-NEXT: ret i8 [[RES]] +; + %and = and i16 %x, 255 + %and2 = and i16 %and, %y + %res = trunc nuw nsw i16 %and2 to i8 + ret i8 %res +} diff --git a/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll b/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll index 8f1c52c591631..708cdc9ca45ec 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/merge-functions.ll @@ -14,8 +14,7 @@ define i1 @test1(i32 %c) { ; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i32 [[SWITCH_TABLEIDX]], 20 ; CHECK-NEXT: [[SWITCH_CAST:%.*]] = trunc i32 [[SWITCH_TABLEIDX]] to i20 ; CHECK-NEXT: [[SWITCH_DOWNSHIFT:%.*]] = lshr i20 -490991, [[SWITCH_CAST]] -; CHECK-NEXT: [[TMP1:%.*]] = and i20 [[SWITCH_DOWNSHIFT]], 1 -; CHECK-NEXT: [[SWITCH_MASKED:%.*]] = icmp ne i20 [[TMP1]], 0 +; CHECK-NEXT: [[SWITCH_MASKED:%.*]] = trunc i20 [[SWITCH_DOWNSHIFT]] to i1 ; CHECK-NEXT: [[I_0:%.*]] = select i1 [[TMP0]], i1 [[SWITCH_MASKED]], i1 false ; CHECK-NEXT: ret i1 [[I_0]] ; diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll index e00b737cae4e8..4881937df101a 100644 --- a/llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-callee-profile-mismatch.ll @@ -1,6 +1,6 @@ ; REQUIRES: x86_64-linux ; REQUIRES: asserts -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-callee-profile-mismatch.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-impl -pass-remarks=inline 2>&1 | FileCheck %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-callee-profile-mismatch.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl -pass-remarks=inline 2>&1 | FileCheck %s ; CHECK: Run stale profile matching for bar diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-lto.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-lto.ll index 270beee4ebc2b..7aabeeca2585b 100644 --- a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-lto.ll +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching-lto.ll @@ -1,6 +1,6 @@ ; REQUIRES: x86_64-linux ; REQUIRES: asserts -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-stale-profile-matching-lto.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-impl 2>&1 | FileCheck %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-stale-profile-matching-lto.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl 2>&1 | FileCheck %s ; 
CHECK: Run stale profile matching for main diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching.ll index 29877fb22a2c2..0d471e43d2a72 100644 --- a/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching.ll +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-stale-profile-matching.ll @@ -1,6 +1,6 @@ ; REQUIRES: x86_64-linux ; REQUIRES: asserts -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-stale-profile-matching.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-impl 2>&1 | FileCheck %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-stale-profile-matching.prof --salvage-stale-profile -S --debug-only=sample-profile,sample-profile-matcher,sample-profile-impl 2>&1 | FileCheck %s ; The profiled source code: diff --git a/llvm/test/Transforms/SimplifyCFG/HoistCode.ll b/llvm/test/Transforms/SimplifyCFG/HoistCode.ll index a081eddfc4566..4a4c94098ab94 100644 --- a/llvm/test/Transforms/SimplifyCFG/HoistCode.ll +++ b/llvm/test/Transforms/SimplifyCFG/HoistCode.ll @@ -64,8 +64,8 @@ define float @PR39535min_switch(i64 %i, float %x) { ; CHECK-LABEL: @PR39535min_switch( ; CHECK-NEXT: entry: ; CHECK-NEXT: switch i64 [[I:%.*]], label [[END:%.*]] [ -; CHECK-NEXT: i64 1, label [[BB1:%.*]] -; CHECK-NEXT: i64 2, label [[BB2:%.*]] +; CHECK-NEXT: i64 1, label [[BB1:%.*]] +; CHECK-NEXT: i64 2, label [[BB2:%.*]] ; CHECK-NEXT: ] ; CHECK: bb1: ; CHECK-NEXT: br label [[END]] @@ -154,3 +154,33 @@ F: %z2 = or disjoint i32 %x, %y ret i32 %z2 } + +define i16 @hoist_trunc_flags_preserve(i1 %C, i32 %x) { +; CHECK-LABEL: @hoist_trunc_flags_preserve( +; CHECK-NEXT: common.ret: +; CHECK-NEXT: [[Z1:%.*]] = trunc nuw nsw i32 [[X:%.*]] to i16 +; CHECK-NEXT: ret i16 [[Z1]] +; + br i1 %C, label %T, label %F +T: + %z1 = trunc nsw nuw i32 %x to i16 + ret i16 %z1 +F: + %z2 = trunc nsw nuw i32 %x to i16 + ret i16 %z2 +} + +define i16 @hoist_trunc_flags_drop(i1 %C, i32 %x) { +; CHECK-LABEL: @hoist_trunc_flags_drop( +; CHECK-NEXT: common.ret: +; CHECK-NEXT: [[Z1:%.*]] = trunc i32 [[X:%.*]] to i16 +; CHECK-NEXT: ret i16 [[Z1]] +; + br i1 %C, label %T, label %F +T: + %z1 = trunc i32 %x to i16 + ret i16 %z1 +F: + %z2 = trunc nsw nuw i32 %x to i16 + ret i16 %z2 +} diff --git a/llvm/unittests/CodeGen/GlobalISel/CSETest.cpp b/llvm/unittests/CodeGen/GlobalISel/CSETest.cpp index 08857de3cf4e4..822707a1f4ed3 100644 --- a/llvm/unittests/CodeGen/GlobalISel/CSETest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/CSETest.cpp @@ -275,4 +275,174 @@ TEST_F(AArch64GISelMITest, TestConstantFoldCTT) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } +TEST_F(AArch64GISelMITest, TestConstantFoldICMP) { + setUp(); + if (!TM) + GTEST_SKIP(); + + LLT s32 = LLT::scalar(32); + LLT s1 = LLT::scalar(1); + + GISelCSEInfo CSEInfo; + CSEInfo.setCSEConfig(std::make_unique<CSEConfigFull>()); + CSEInfo.analyze(*MF); + B.setCSEInfo(&CSEInfo); + CSEMIRBuilder CSEB(B.getState()); + + auto One = CSEB.buildConstant(s32, 1); + auto Two = CSEB.buildConstant(s32, 2); + auto MinusOne = CSEB.buildConstant(s32, -1); + auto MinusTwo = CSEB.buildConstant(s32, -2); + + // ICMP_EQ + { + auto I = CSEB.buildICmp(CmpInst::Predicate::ICMP_EQ, s1, One, One); + EXPECT_TRUE(I->getOpcode() == TargetOpcode::G_CONSTANT); + EXPECT_TRUE(I->getOperand(1).getCImm()->getZExtValue()); + } + + // ICMP_NE + { + auto I = CSEB.buildICmp(CmpInst::Predicate::ICMP_NE, s1, One, Two); + 
EXPECT_TRUE(I->getOpcode() == TargetOpcode::G_CONSTANT); + EXPECT_TRUE(I->getOperand(1).getCImm()->getZExtValue()); + } + + // ICMP_UGT + { + auto I = CSEB.buildICmp(CmpInst::Predicate::ICMP_UGT, s1, Two, One); + EXPECT_TRUE(I->getOpcode() == TargetOpcode::G_CONSTANT); + EXPECT_TRUE(I->getOperand(1).getCImm()->getZExtValue()); + } + + // ICMP_UGE + { + auto I = CSEB.buildICmp(CmpInst::Predicate::ICMP_UGE, s1, One, One); + EXPECT_TRUE(I->getOpcode() == TargetOpcode::G_CONSTANT); + EXPECT_TRUE(I->getOperand(1).getCImm()->getZExtValue()); + } + + // ICMP_ULT + { + auto I = CSEB.buildICmp(CmpInst::Predicate::ICMP_ULT, s1, One, Two); + EXPECT_TRUE(I->getOpcode() == TargetOpcode::G_CONSTANT); + EXPECT_TRUE(I->getOperand(1).getCImm()->getZExtValue()); + } + + // ICMP_ULE + { + auto I = CSEB.buildICmp(CmpInst::Predicate::ICMP_ULE, s1, Two, Two); + EXPECT_TRUE(I->getOpcode() == TargetOpcode::G_CONSTANT); + EXPECT_TRUE(I->getOperand(1).getCImm()->getZExtValue()); + } + + // ICMP_SGT + { + auto I = + CSEB.buildICmp(CmpInst::Predicate::ICMP_SGT, s1, MinusOne, MinusTwo); + EXPECT_TRUE(I->getOpcode() == TargetOpcode::G_CONSTANT); + EXPECT_TRUE(I->getOperand(1).getCImm()->getZExtValue()); + } + + // ICMP_SGE + { + auto I = + CSEB.buildICmp(CmpInst::Predicate::ICMP_SGE, s1, MinusOne, MinusOne); + EXPECT_TRUE(I->getOpcode() == TargetOpcode::G_CONSTANT); + EXPECT_TRUE(I->getOperand(1).getCImm()->getZExtValue()); + } + + // ICMP_SLT + { + auto I = + CSEB.buildICmp(CmpInst::Predicate::ICMP_SLT, s1, MinusTwo, MinusOne); + EXPECT_TRUE(I->getOpcode() == TargetOpcode::G_CONSTANT); + EXPECT_TRUE(I->getOperand(1).getCImm()->getZExtValue()); + } + + // ICMP_SLE + { + auto I = + CSEB.buildICmp(CmpInst::Predicate::ICMP_SLE, s1, MinusTwo, MinusOne); + EXPECT_TRUE(I->getOpcode() == TargetOpcode::G_CONSTANT); + EXPECT_TRUE(I->getOperand(1).getCImm()->getZExtValue()); + } + + LLT VecTy = LLT::fixed_vector(2, s32); + LLT DstTy = LLT::fixed_vector(2, s1); + auto Three = CSEB.buildConstant(s32, 3); + auto MinusThree = CSEB.buildConstant(s32, -3); + auto OneOne = CSEB.buildBuildVector(VecTy, {One.getReg(0), One.getReg(0)}); + auto OneTwo = CSEB.buildBuildVector(VecTy, {One.getReg(0), Two.getReg(0)}); + auto TwoThree = + CSEB.buildBuildVector(VecTy, {Two.getReg(0), Three.getReg(0)}); + auto MinusOneOne = + CSEB.buildBuildVector(VecTy, {MinusOne.getReg(0), MinusOne.getReg(0)}); + auto MinusOneTwo = + CSEB.buildBuildVector(VecTy, {MinusOne.getReg(0), MinusTwo.getReg(0)}); + auto MinusTwoThree = + CSEB.buildBuildVector(VecTy, {MinusTwo.getReg(0), MinusThree.getReg(0)}); + + // ICMP_EQ + CSEB.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, OneOne, OneOne); + + // ICMP_NE + CSEB.buildICmp(CmpInst::Predicate::ICMP_NE, DstTy, OneOne, OneTwo); + + // ICMP_UGT + CSEB.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, TwoThree, OneTwo); + + // ICMP_UGE + CSEB.buildICmp(CmpInst::Predicate::ICMP_UGE, DstTy, OneTwo, OneOne); + + // ICMP_ULT + CSEB.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, OneOne, OneTwo); + + // ICMP_ULE + CSEB.buildICmp(CmpInst::Predicate::ICMP_ULE, DstTy, OneTwo, OneOne); + + // ICMP_SGT + CSEB.buildICmp(CmpInst::Predicate::ICMP_SGT, DstTy, MinusOneTwo, + MinusTwoThree); + + // ICMP_SGE + CSEB.buildICmp(CmpInst::Predicate::ICMP_SGE, DstTy, MinusOneTwo, MinusOneOne); + + // ICMP_SLT + CSEB.buildICmp(CmpInst::Predicate::ICMP_SLT, DstTy, MinusTwoThree, + MinusOneTwo); + + // ICMP_SLE + CSEB.buildICmp(CmpInst::Predicate::ICMP_SLE, DstTy, MinusOneTwo, MinusOneOne); + + auto CheckStr = R"( + ; CHECK: [[One:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 1 + ; CHECK: [[Two:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[MinusOne:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK: [[MinusTwo:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2 + ; CHECK: [[True:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; CHECK: [[Three:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[MinusThree:%[0-9]+]]:_(s32) = G_CONSTANT i32 -3 + ; CHECK: {{%[0-9]+}}:_(<2 x s32>) = G_BUILD_VECTOR [[One]]:_(s32), [[One]]:_(s32) + ; CHECK: {{%[0-9]+}}:_(<2 x s32>) = G_BUILD_VECTOR [[One]]:_(s32), [[Two]]:_(s32) + ; CHECK: {{%[0-9]+}}:_(<2 x s32>) = G_BUILD_VECTOR [[Two]]:_(s32), [[Three]]:_(s32) + ; CHECK: {{%[0-9]+}}:_(<2 x s32>) = G_BUILD_VECTOR [[MinusOne]]:_(s32), [[MinusOne]]:_(s32) + ; CHECK: {{%[0-9]+}}:_(<2 x s32>) = G_BUILD_VECTOR [[MinusOne]]:_(s32), [[MinusTwo]]:_(s32) + ; CHECK: {{%[0-9]+}}:_(<2 x s32>) = G_BUILD_VECTOR [[MinusTwo]]:_(s32), [[MinusThree]]:_(s32) + ; CHECK: {{%[0-9]+}}:_(<2 x s1>) = G_BUILD_VECTOR [[True]]:_(s1), [[True]]:_(s1) + ; CHECK: [[False:%[0-9]+]]:_(s1) = G_CONSTANT i1 false + ; CHECK: {{%[0-9]+}}:_(<2 x s1>) = G_BUILD_VECTOR [[False]]:_(s1), [[True]]:_(s1) + ; CHECK: {{%[0-9]+}}:_(<2 x s1>) = G_BUILD_VECTOR [[True]]:_(s1), [[True]]:_(s1) + ; CHECK: {{%[0-9]+}}:_(<2 x s1>) = G_BUILD_VECTOR [[True]]:_(s1), [[True]]:_(s1) + ; CHECK: {{%[0-9]+}}:_(<2 x s1>) = G_BUILD_VECTOR [[False]]:_(s1), [[True]]:_(s1) + ; CHECK: {{%[0-9]+}}:_(<2 x s1>) = G_BUILD_VECTOR [[True]]:_(s1), [[False]]:_(s1) + ; CHECK: {{%[0-9]+}}:_(<2 x s1>) = G_BUILD_VECTOR [[True]]:_(s1), [[True]]:_(s1) + ; CHECK: {{%[0-9]+}}:_(<2 x s1>) = G_BUILD_VECTOR [[True]]:_(s1), [[False]]:_(s1) + ; CHECK: {{%[0-9]+}}:_(<2 x s1>) = G_BUILD_VECTOR [[True]]:_(s1), [[True]]:_(s1) + ; CHECK: {{%[0-9]+}}:_(<2 x s1>) = G_BUILD_VECTOR [[True]]:_(s1), [[True]]:_(s1) + )"; + + EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; +} + } // namespace diff --git a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp index 1ae6efd4a7d0a..ef9e9ff04f85f 100644 --- a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp @@ -2459,6 +2459,7 @@ void GICombinerEmitter::emitRunCustomAction(raw_ostream &OS) { OS << " switch(ApplyID) {\n"; for (const auto &Apply : ApplyCode) { OS << " case " << Apply->getEnumNameWithPrefix(CXXApplyPrefix) << ":{\n" + << " Helper.getBuilder().setInstrAndDebugLoc(*State.MIs[0]);\n" << " " << join(split(Apply->Code, '\n'), "\n ") << '\n' << " return;\n"; OS << " }\n"; diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index ac111f9025135..8a2ab18bf953a 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -500,22 +500,27 @@ if (current_toolchain == default_toolchain) { "__fwd/array.h", "__fwd/bit_reference.h", "__fwd/complex.h", + "__fwd/deque.h", "__fwd/format.h", "__fwd/fstream.h", "__fwd/functional.h", "__fwd/ios.h", "__fwd/istream.h", "__fwd/mdspan.h", + "__fwd/memory.h", "__fwd/memory_resource.h", "__fwd/ostream.h", "__fwd/pair.h", + "__fwd/queue.h", "__fwd/span.h", "__fwd/sstream.h", + "__fwd/stack.h", "__fwd/streambuf.h", "__fwd/string.h", "__fwd/string_view.h", "__fwd/subrange.h", "__fwd/tuple.h", + "__fwd/vector.h", "__hash_table", "__ios/fpos.h", "__iterator/access.h", diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn index 2003e86e90b96..0d134c7bdffb7 100644 --- 
a/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn @@ -57,6 +57,7 @@ static_library("IPO") { "SCCP.cpp", "SampleContextTracker.cpp", "SampleProfile.cpp", + "SampleProfileMatcher.cpp", "SampleProfileProbe.cpp", "StripDeadPrototypes.cpp", "StripSymbols.cpp", diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h index 773957a8b5116..80e3fec22694f 100644 --- a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h +++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h @@ -143,6 +143,12 @@ class SparseTensorStorageBase { MLIR_SPARSETENSOR_FOREVERY_FIXED_O(DECL_GETCOORDINATES) #undef DECL_GETCOORDINATES + /// Gets coordinates-overhead storage buffer for the given level. +#define DECL_GETCOORDINATESBUFFER(INAME, C) \ + virtual void getCoordinatesBuffer(std::vector<C> **, uint64_t); + MLIR_SPARSETENSOR_FOREVERY_FIXED_O(DECL_GETCOORDINATESBUFFER) +#undef DECL_GETCOORDINATESBUFFER + /// Gets primary storage. #define DECL_GETVALUES(VNAME, V) virtual void getValues(std::vector<V> **); MLIR_SPARSETENSOR_FOREVERY_V(DECL_GETVALUES) @@ -251,6 +257,31 @@ class SparseTensorStorage final : public SparseTensorStorageBase { assert(lvl < getLvlRank()); *out = &coordinates[lvl]; } + void getCoordinatesBuffer(std::vector<C> **out, uint64_t lvl) final { + assert(out && "Received nullptr for out parameter"); + assert(lvl < getLvlRank()); + // Note that the sparse tensor support library always stores COO in SoA + // format, even when AoS is requested. This is never an issue, since all + // actual code/library generation requests "views" into the coordinate + // storage for the individual levels, which is trivially provided for + // both AoS and SoA (as well as all the other storage formats). The only + // exception is when the buffer version of coordinate storage is requested + // (currently only for printing). In that case, we do the following + // potentially expensive transformation to provide that view. If this + // operation becomes more common beyond debugging, we should consider + // implementing proper AoS in the support library as well. + uint64_t lvlRank = getLvlRank(); + uint64_t nnz = values.size(); + crdBuffer.clear(); + crdBuffer.reserve(nnz * (lvlRank - lvl)); + for (uint64_t i = 0; i < nnz; i++) { + for (uint64_t l = lvl; l < lvlRank; l++) { + assert(i < coordinates[l].size()); + crdBuffer.push_back(coordinates[l][i]); + } + } + *out = &crdBuffer; + } void getValues(std::vector<V> **out) final { assert(out && "Received nullptr for out parameter"); *out = &values; @@ -529,10 +560,14 @@ class SparseTensorStorage final : public SparseTensorStorageBase { return -1u; } + // Sparse tensor storage components. std::vector<std::vector<P>> positions; std::vector<std::vector<C>> coordinates; std::vector<V> values; + + // Auxiliary data structures. 
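  // Worked example for the AoS view above (illustrative only, using the
  // values from the #COOAoS/#COOSoA print test later in this patch): with
  // nnz == 5, coordinates[0] == {0, 0, 3, 3, 3} and
  // coordinates[1] == {0, 2, 2, 3, 5}, a call to
  // getCoordinatesBuffer(&out, /*lvl=*/0) interleaves the two per-level
  // vectors into crdBuffer == {0,0, 0,2, 3,2, 3,3, 3,5}, i.e. one
  // (row, col) pair per stored entry.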
std::vector<uint64_t> lvlCursor; + std::vector<C> crdBuffer; // just for AoS view }; //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h b/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h index d916186c835c2..396f76fd8f921 100644 --- a/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h +++ b/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h @@ -77,6 +77,14 @@ MLIR_SPARSETENSOR_FOREVERY_O(DECL_SPARSEPOSITIONS) MLIR_SPARSETENSOR_FOREVERY_O(DECL_SPARSECOORDINATES) #undef DECL_SPARSECOORDINATES +/// Tensor-storage method to obtain direct access to the coordinates array +/// buffer for the given level (provides an AoS view into the library). +#define DECL_SPARSECOORDINATESBUFFER(CNAME, C) \ + MLIR_CRUNNERUTILS_EXPORT void _mlir_ciface_sparseCoordinatesBuffer##CNAME( \ + StridedMemRefType<C, 1> *out, void *tensor, index_type lvl); +MLIR_SPARSETENSOR_FOREVERY_O(DECL_SPARSECOORDINATESBUFFER) +#undef DECL_SPARSECOORDINATESBUFFER + /// Tensor-storage method to insert elements in lexicographical /// level-coordinate order. #define DECL_LEXINSERT(VNAME, V) \ diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp index 92c98b34af602..c52fa3751e6b4 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp @@ -275,7 +275,7 @@ static Value genPositionsCall(OpBuilder &builder, Location loc, .getResult(0); } -/// Generates a call to obtain the coordindates array. +/// Generates a call to obtain the coordinates array. static Value genCoordinatesCall(OpBuilder &builder, Location loc, SparseTensorType stt, Value ptr, Level l) { Type crdTp = stt.getCrdType(); @@ -287,6 +287,20 @@ static Value genCoordinatesCall(OpBuilder &builder, Location loc, .getResult(0); } +/// Generates a call to obtain the coordinates array (AoS view). +static Value genCoordinatesBufferCall(OpBuilder &builder, Location loc, + SparseTensorType stt, Value ptr, + Level l) { + Type crdTp = stt.getCrdType(); + auto resTp = MemRefType::get({ShapedType::kDynamic}, crdTp); + Value lvl = constantIndex(builder, loc, l); + SmallString<25> name{"sparseCoordinatesBuffer", + overheadTypeFunctionSuffix(crdTp)}; + return createFuncCall(builder, loc, name, resTp, {ptr, lvl}, + EmitCInterface::On) + .getResult(0); +} + //===----------------------------------------------------------------------===// // Conversion rules. //===----------------------------------------------------------------------===// @@ -518,13 +532,35 @@ class SparseTensorToCoordinatesConverter LogicalResult matchAndRewrite(ToCoordinatesOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { + const Location loc = op.getLoc(); + auto stt = getSparseTensorType(op.getTensor()); + auto crds = genCoordinatesCall(rewriter, loc, stt, adaptor.getTensor(), + op.getLevel()); + // Cast the MemRef type to the type expected by the users, though these + // two types should be compatible at runtime. + if (op.getType() != crds.getType()) + crds = rewriter.create<memref::CastOp>(loc, op.getType(), crds); + rewriter.replaceOp(op, crds); + return success(); + } +}; + +/// Sparse conversion rule for coordinate accesses (AoS style). 
+class SparseToCoordinatesBufferConverter + : public OpConversionPattern<ToCoordinatesBufferOp> { +public: + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(ToCoordinatesBufferOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + const Location loc = op.getLoc(); auto stt = getSparseTensorType(op.getTensor()); - auto crds = genCoordinatesCall(rewriter, op.getLoc(), stt, - adaptor.getTensor(), op.getLevel()); + auto crds = genCoordinatesBufferCall( + rewriter, loc, stt, adaptor.getTensor(), stt.getAoSCOOStart()); // Cast the MemRef type to the type expected by the users, though these // two types should be compatible at runtime. if (op.getType() != crds.getType()) - crds = rewriter.create<memref::CastOp>(op.getLoc(), op.getType(), crds); + crds = rewriter.create<memref::CastOp>(loc, op.getType(), crds); rewriter.replaceOp(op, crds); return success(); } @@ -878,10 +914,10 @@ void mlir::populateSparseTensorConversionPatterns(TypeConverter &typeConverter, SparseTensorAllocConverter, SparseTensorEmptyConverter, SparseTensorDeallocConverter, SparseTensorReorderCOOConverter, SparseTensorToPositionsConverter, SparseTensorToCoordinatesConverter, - SparseTensorToValuesConverter, SparseNumberOfEntriesConverter, - SparseTensorLoadConverter, SparseTensorInsertConverter, - SparseTensorExpandConverter, SparseTensorCompressConverter, - SparseTensorAssembleConverter, SparseTensorDisassembleConverter, - SparseHasRuntimeLibraryConverter>(typeConverter, - patterns.getContext()); + SparseToCoordinatesBufferConverter, SparseTensorToValuesConverter, + SparseNumberOfEntriesConverter, SparseTensorLoadConverter, + SparseTensorInsertConverter, SparseTensorExpandConverter, + SparseTensorCompressConverter, SparseTensorAssembleConverter, + SparseTensorDisassembleConverter, SparseHasRuntimeLibraryConverter>( + typeConverter, patterns.getContext()); } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp index 17f70d0796ccf..b117c1694e45b 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp @@ -648,7 +648,9 @@ struct PrintRewriter : public OpRewritePattern<PrintOp> { loc, lvl, vector::PrintPunctuation::NoPunctuation); rewriter.create<vector::PrintOp>(loc, rewriter.getStringAttr("] : ")); Value crd = nullptr; - // TODO: eliminates ToCoordinateBufferOp! + // For COO AoS storage, we want to print a single, linear view of + // the full coordinate storage at this level. For any other storage, + // we show the coordinate storage for every individual level. if (stt.getAoSCOOStart() == l) crd = rewriter.create<ToCoordinatesBufferOp>(loc, tensor); else diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp index 6f6b6dcdad200..69c497264fd1e 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp @@ -1643,10 +1643,12 @@ struct DropUnitDimFromElementwiseOps final if (!resultVectorType) return failure(); - // Check the pre-conditions. For `Elementwise` Ops all operands are - // guaranteed to have identical shapes and it suffices to only check the - // first one. - auto sourceVectorType = cast<VectorType>(op->getOperands()[0].getType()); + // Check the operand pre-conditions. For `Elementwise` ops all operands are + // guaranteed to have identical shapes (with some exceptions such as + // `arith.select`) and it suffices to only check one of them. 
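  // For example (illustrative): in `arith.select %cond, %a, %b :
  // vector<1x16xi1>` with a scalar %cond, operand 0 has type i1 rather
  // than a VectorType, so an unconditional cast would assert; see the
  // drop_unit_dims_scalar_cond_select test added further down in this patch.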
+ auto sourceVectorType = dyn_cast<VectorType>(op->getOperand(0).getType()); + if (!sourceVectorType) + return failure(); if (sourceVectorType.getRank() < 2) return failure(); diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt index b7e448d5417ea..a091944b9ee7d 100644 --- a/mlir/lib/ExecutionEngine/CMakeLists.txt +++ b/mlir/lib/ExecutionEngine/CMakeLists.txt @@ -97,6 +97,29 @@ add_mlir_library(MLIRExecutionEngine MLIRTargetLLVMIRExport ) +if(LLVM_BUILD_LLVM_DYLIB) + # Build a shared library for the execution engine. Some downstream projects + # use this library to build their own CPU runners while preserving dynamic + # linkage. + add_mlir_library(MLIRExecutionEngineShared + ExecutionEngine.cpp + SHARED + + EXCLUDE_FROM_LIBMLIR + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/ExecutionEngine + + # Ensures that all necessary dependencies are resolved. + DEPENDS + MLIRExecutionEngine + + LINK_LIBS PUBLIC + LLVM + MLIR + ) +endif() + get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_mlir_library(MLIRJitRunner JitRunner.cpp diff --git a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp index aaa42a7e3a31b..acb2d1bb5bed6 100644 --- a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp +++ b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp @@ -68,6 +68,14 @@ MLIR_SPARSETENSOR_FOREVERY_FIXED_O(IMPL_GETPOSITIONS) MLIR_SPARSETENSOR_FOREVERY_FIXED_O(IMPL_GETCOORDINATES) #undef IMPL_GETCOORDINATES +#define IMPL_GETCOORDINATESBUFFER(CNAME, C) \ + void SparseTensorStorageBase::getCoordinatesBuffer(std::vector<C> **, \ + uint64_t) { \ + FATAL_PIV("getCoordinatesBuffer" #CNAME); \ + } +MLIR_SPARSETENSOR_FOREVERY_FIXED_O(IMPL_GETCOORDINATESBUFFER) +#undef IMPL_GETCOORDINATESBUFFER + #define IMPL_GETVALUES(VNAME, V) \ void SparseTensorStorageBase::getValues(std::vector<V> **) { \ FATAL_PIV("getValues" #VNAME); \ diff --git a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp index 8835056099d23..f160b0f40fb0a 100644 --- a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp +++ b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp @@ -311,6 +311,7 @@ MLIR_SPARSETENSOR_FOREVERY_V(IMPL_SPARSEVALUES) assert(v); \ aliasIntoMemref(v->size(), v->data(), *ref); \ } + #define IMPL_SPARSEPOSITIONS(PNAME, P) \ IMPL_GETOVERHEAD(sparsePositions##PNAME, P, getPositions) MLIR_SPARSETENSOR_FOREVERY_O(IMPL_SPARSEPOSITIONS) @@ -320,6 +321,12 @@ MLIR_SPARSETENSOR_FOREVERY_O(IMPL_SPARSEPOSITIONS) IMPL_GETOVERHEAD(sparseCoordinates##CNAME, C, getCoordinates) MLIR_SPARSETENSOR_FOREVERY_O(IMPL_SPARSECOORDINATES) #undef IMPL_SPARSECOORDINATES + +#define IMPL_SPARSECOORDINATESBUFFER(CNAME, C) \ + IMPL_GETOVERHEAD(sparseCoordinatesBuffer##CNAME, C, getCoordinatesBuffer) +MLIR_SPARSETENSOR_FOREVERY_O(IMPL_SPARSECOORDINATESBUFFER) +#undef IMPL_SPARSECOORDINATESBUFFER + #undef IMPL_GETOVERHEAD #define IMPL_LEXINSERT(VNAME, V) \ diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 2ec0b964b304f..3c72c8789e8ec 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -279,11 +279,13 @@ class CreateBlockRewrite : public BlockRewrite { auto &blockOps = block->getOperations(); while (!blockOps.empty()) blockOps.remove(blockOps.begin()); - block->dropAllUses(); - if (block->getParent()) + block->dropAllDefinedValueUses(); + if (block->getParent()) { block->erase(); - } 
else { + block->dropAllDefinedValueUses(); + delete block; + } } }; diff --git a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp index 6cb5635e68c92..bbecbdb856693 100644 --- a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp +++ b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp @@ -377,7 +377,7 @@ class GreedyPatternRewriteDriver : public PatternRewriter, /// be re-added to the worklist. This function should be called when an /// operation is modified or removed, as it may trigger further /// simplifications. - void addOperandsToWorklist(ValueRange operands); + void addOperandsToWorklist(Operation *op); /// Notify the driver that the given block was inserted. void notifyBlockInserted(Block *block, Region *previous, @@ -688,17 +688,36 @@ void GreedyPatternRewriteDriver::notifyOperationModified(Operation *op) { addToWorklist(op); } -void GreedyPatternRewriteDriver::addOperandsToWorklist(ValueRange operands) { - for (Value operand : operands) { - // If the use count of this operand is now < 2, we re-add the defining - // operation to the worklist. - // TODO: This is based on the fact that zero use operations - // may be deleted, and that single use values often have more - // canonicalization opportunities. - if (!operand || (!operand.use_empty() && !operand.hasOneUse())) +void GreedyPatternRewriteDriver::addOperandsToWorklist(Operation *op) { + for (Value operand : op->getOperands()) { + // If this operand currently has at most 2 users, add its defining op to + // the worklist. Indeed, after the op is deleted, the operand will have at + // most 1 user left. If it has 0 users left, it can be deleted too, + // and if it has 1 user left, there may be further canonicalization + // opportunities. 
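    // Example (illustrative, matching the issue-86765 tests added below): if
    // the erased op is `%dead = arith.addi %if, %if` and these are %if's only
    // uses, then %if has two uses but only one distinct user, and erasing the
    // op leaves it unused; counting distinct users below is what lets its
    // producer be revisited in that case.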
+ if (!operand) continue; - if (auto *defOp = operand.getDefiningOp()) - addToWorklist(defOp); + + auto *defOp = operand.getDefiningOp(); + if (!defOp) + continue; + + Operation *otherUser = nullptr; + bool hasMoreThanTwoUses = false; + for (auto user : operand.getUsers()) { + if (user == op || user == otherUser) + continue; + if (!otherUser) { + otherUser = user; + continue; + } + hasMoreThanTwoUses = true; + break; + } + if (hasMoreThanTwoUses) + continue; + + addToWorklist(defOp); } } @@ -722,7 +741,7 @@ void GreedyPatternRewriteDriver::notifyOperationErased(Operation *op) { if (config.listener) config.listener->notifyOperationErased(op); - addOperandsToWorklist(op->getOperands()); + addOperandsToWorklist(op); worklist.remove(op); if (config.strictMode != GreedyRewriteStrictness::AnyOp) diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-invalid.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-invalid.mlir index 17eec59369186..6494e1b271948 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-invalid.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-invalid.mlir @@ -15,3 +15,16 @@ func.func @tensor_with_unknown_rank(%arg0: tensor<*xi8>) -> tensor<*xi8> { %0 = "tosa.abs"(%arg0) : (tensor<*xi8>) -> tensor<*xi8> return %0 : tensor<*xi8> } + +// ----- + +// CHECK-LABEL: @unranked_add +func.func @unranked_add(%arg0 : tensor<10x10xf32> , %arg1 : tensor<10x10xf32>, %arg2 : tensor<*xf32>) -> (tensor<10x10xf32>) { + // expected-error@+3 {{failed to legalize operation 'tosa.add'}} + %reduce = tosa.reduce_max %arg0 {axis = 1 : i32} : (tensor<10x10xf32>) -> tensor<10x1xf32> + %1 = tosa.add %reduce, %arg1 : (tensor<10x1xf32>, tensor<10x10xf32>) -> tensor<10x10xf32> + %0 = tosa.add %1, %arg2 : (tensor<10x10xf32>, tensor<*xf32>) -> tensor<*xf32> + %2 = tosa.reshape %0 {new_shape = array<i64: 10, 10>} : (tensor<*xf32>) -> tensor<10x10xf32> + return %2 : tensor<10x10xf32> +} + diff --git a/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir b/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir index 4ba51c5953d13..3a120a56056ca 100644 --- a/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir @@ -40,7 +40,7 @@ func.func @cast_away_contraction_leading_one_dims(%arg0: vector<1x16x8xf32>, %ar // CHECK: %[[R1:.*]] = vector.extract %{{.*}}[0] : vector<8x16xf32> from vector<1x8x16xf32> // CHECK: %[[R2:.*]] = vector.extract %{{.*}}[0] : vector<16x16xf32> from vector<1x16x16xf32> // CHECK: %[[CONTRACT:.*]] = vector.mask %[[MASK]] { -// CHECK-SAME: vector.contract {indexing_maps = [#[[$MAP_0]], #[[$MAP_1]], #[[$MAP_2]]], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} +// CHECK-SAME: vector.contract {indexing_maps = [#[[$MAP_0]], #[[$MAP_1]], #[[$MAP_2]]], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} // CHECK-SAME: %[[R0]], %[[R1]], %[[R2]] : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32> // CHECK-SAME: } : vector<16x16x8xi1> -> vector<16x16xf32> // CHECK: %[[RES:.*]] = vector.broadcast %[[CONTRACT]] : vector<16x16xf32> to vector<1x16x16xf32> @@ -76,7 +76,7 @@ func.func @cast_away_contraction_leading_one_dim_under_const_mask(%arg0: vector< // CHECK: %[[M:.*]] = vector.extract %{{.*}} : vector<16x16x8xi1> from vector<1x16x16x8xi1> // CHECK: %[[CONTRACT:.*]] = vector.mask %[[M]] { // CHECK-SAME: vector.contract {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", 
"reduction"], kind = #vector.kind<add>} -// CHECK-SAME: %[[R0]], %[[R1]], %[[R2]] : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32> +// CHECK-SAME: %[[R0]], %[[R1]], %[[R2]] : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32> // CHECK-SAME: } : vector<16x16x8xi1> -> vector<16x16xf32> // CHECK-NEXT: %[[RES:.*]] = vector.broadcast %[[CONTRACT]] : vector<16x16xf32> to vector<1x16x16xf32> // CHECK-NEXT: return %[[RES]] : vector<1x16x16xf32> @@ -472,6 +472,8 @@ func.func @cast_away_elementwise_leading_one_dims( return %0, %1, %2, %3: vector<1x1x8xf32>, vector<1x4xi1>, vector<1x4xf32>, vector<1x4xf32> } +// ----- + // CHECK-LABEL: func @cast_away_insert_leading_one_dims_scalar // CHECK-SAME: (%[[S:.+]]: f32, %[[V:.+]]: vector<1x1x4xf32>) // CHECK: %[[EXTRACT:.+]] = vector.extract %[[V]][0, 0] : vector<4xf32> from vector<1x1x4xf32> @@ -483,6 +485,8 @@ func.func @cast_away_insert_leading_one_dims_scalar(%s: f32, %v: vector<1x1x4xf3 return %0: vector<1x1x4xf32> } +// ----- + // CHECK-LABEL: func.func @cast_away_insert_leading_one_dims_scalar_scalable( // CHECK-SAME: %[[S:.*]]: f32, // CHECK-SAME: %[[V:.*]]: vector<1x1x[4]xf32>) -> vector<1x1x[4]xf32> { @@ -495,6 +499,8 @@ func.func @cast_away_insert_leading_one_dims_scalar_scalable(%s: f32, %v: vector return %0: vector<1x1x[4]xf32> } +// ----- + // CHECK-LABEL: func.func @cast_away_insert_leading_one_dims_scalar_skip_scalable_dim( // CHECK-SAME: %[[S:.*]]: f32, // CHECK-SAME: %[[V:.*]]: vector<1x[1]x4xf32>) -> vector<1x[1]x4xf32> { @@ -507,6 +513,8 @@ func.func @cast_away_insert_leading_one_dims_scalar_skip_scalable_dim(%s: f32, % return %0: vector<1x[1]x4xf32> } +// ----- + // CHECK-LABEL: func @cast_away_insert_leading_one_dims_rank1 // CHECK-SAME: (%[[S:.+]]: vector<4xf32>, %[[V:.+]]: vector<1x1x4xf32>) // CHECK: %[[BCAST:.+]] = vector.broadcast %[[S]] : vector<4xf32> to vector<1x1x4xf32> @@ -516,6 +524,8 @@ func.func @cast_away_insert_leading_one_dims_rank1(%s: vector<4xf32>, %v: vector return %0: vector<1x1x4xf32> } +// ----- + // CHECK-LABEL: func.func @cast_away_insert_leading_one_dims_rank1_scalable( // CHECK-SAME: %[[S:.*]]: vector<[4]xf32>, // CHECK-SAME: %[[V:.*]]: vector<1x1x[4]xf32>) -> vector<1x1x[4]xf32> { @@ -526,6 +536,8 @@ func.func @cast_away_insert_leading_one_dims_rank1_scalable(%s: vector<[4]xf32>, return %0: vector<1x1x[4]xf32> } +// ----- + // CHECK-LABEL: func @cast_away_insert_leading_one_dims_rank2 // CHECK-SAME: (%[[S:.+]]: vector<1x4xf32>, %[[V:.+]]: vector<1x1x4xf32>) // CHECK: %[[EXTRACT:.+]] = vector.extract %[[S]][0] : vector<4xf32> from vector<1x4xf32> @@ -536,6 +548,8 @@ func.func @cast_away_insert_leading_one_dims_rank2(%s: vector<1x4xf32>, %v: vect return %0: vector<1x1x4xf32> } +// ----- + // CHECK-LABEL: func.func @cast_away_insert_leading_one_dims_rank2_scalable( // CHECK-SAME: %[[S:.*]]: vector<1x[4]xf32>, // CHECK-SAME: %[[V:.*]]: vector<1x1x[4]xf32>) -> vector<1x1x[4]xf32> { @@ -547,6 +561,8 @@ func.func @cast_away_insert_leading_one_dims_rank2_scalable(%s: vector<1x[4]xf32 return %0: vector<1x1x[4]xf32> } +// ----- + // CHECK-LABEL: func @cast_away_insert_leading_one_dims_rank2_one_dest // CHECK-SAME: (%[[S:.+]]: vector<1x4xf32>, %[[V:.+]]: vector<1x2x1x4xf32>) // CHECK: %[[EXTRACTS:.+]] = vector.extract %[[S]][0] : vector<4xf32> from vector<1x4xf32> @@ -559,6 +575,8 @@ func.func @cast_away_insert_leading_one_dims_rank2_one_dest(%s: vector<1x4xf32>, return %0: vector<1x2x1x4xf32> } +// ----- + // CHECK-LABEL: func.func @cast_away_insert_leading_one_dims_rank2_one_dest_scalable( // CHECK-SAME: 
%[[S:.*]]: vector<1x[4]xf32>, // CHECK-SAME: %[[V:.*]]: vector<1x2x1x[4]xf32>) -> vector<1x2x1x[4]xf32> { @@ -572,6 +590,8 @@ func.func @cast_away_insert_leading_one_dims_rank2_one_dest_scalable(%s: vector< return %0: vector<1x2x1x[4]xf32> } +// ----- + // CHECK-LABEL: func @cast_away_insert_leading_one_dims_non_one_dest // CHECK-SAME: (%[[S:.+]]: vector<1x4xf32>, %[[V:.+]]: vector<8x1x4xf32>) // CHECK: %[[EXTRACT:.+]] = vector.extract %[[S]][0] : vector<4xf32> from vector<1x4xf32> @@ -582,6 +602,8 @@ func.func @cast_away_insert_leading_one_dims_non_one_dest(%s: vector<1x4xf32>, % return %0: vector<8x1x4xf32> } +// ----- + // CHECK-LABEL: func.func @cast_away_insert_leading_one_dims_non_one_dest_scalable( // CHECK-SAME: %[[S:.*]]: vector<1x[4]xf32>, // CHECK-SAME: %[[V:.*]]: vector<8x1x[4]xf32>) -> vector<8x1x[4]xf32> { @@ -593,6 +615,8 @@ func.func @cast_away_insert_leading_one_dims_non_one_dest_scalable(%s: vector<1x return %0: vector<8x1x[4]xf32> } +// ----- + // CHECK-LABEL: func @cast_away_insert_leading_one_dims_one_two_dest // CHECK-SAME: (%[[S:.+]]: vector<1x8xi1>, %[[V:.+]]: vector<1x1x8x1x8xi1>) // CHECK: %[[EXTRACTS:.+]] = vector.extract %[[S]][0] : vector<8xi1> from vector<1x8xi1> @@ -605,6 +629,8 @@ func.func @cast_away_insert_leading_one_dims_one_two_dest(%s: vector<1x8xi1>, %v return %0: vector<1x1x8x1x8xi1> } +// ----- + // CHECK-LABEL: func.func @cast_away_insert_leading_one_dims_one_two_dest_scalable( // CHECK-SAME: %[[S:.*]]: vector<1x[8]xi1>, // CHECK-SAME: %[[V:.*]]: vector<1x1x8x1x[8]xi1>) -> vector<1x1x8x1x[8]xi1> { @@ -618,6 +644,8 @@ func.func @cast_away_insert_leading_one_dims_one_two_dest_scalable(%s: vector<1x return %0: vector<1x1x8x1x[8]xi1> } +// ----- + // CHECK-LABEL: func.func @cast_away_constant_mask() -> vector<1x1x8x2x1xi1> { // CHECK: %[[MASK:.*]] = vector.constant_mask [6, 1, 1] : vector<8x2x1xi1> // CHECK: %[[BCAST:.*]] = vector.broadcast %[[MASK]] : vector<8x2x1xi1> to vector<1x1x8x2x1xi1> @@ -626,3 +654,13 @@ func.func @cast_away_constant_mask() -> vector<1x1x8x2x1xi1> { %0 = vector.constant_mask [1, 1, 6, 1, 1] : vector<1x1x8x2x1xi1> return %0: vector<1x1x8x2x1xi1> } + +// ----- + +// CHECK-LABEL: func.func @drop_unit_dims_scalar_cond_select( +// CHECK: arith.select {{.*}} : vector<16xi1> +func.func @drop_unit_dims_scalar_cond_select(%cond: i1, %arg0: vector<1x16xi1>, %arg1: vector<1x16xi1>) -> vector<1x16xi1> { + %sel = arith.select %cond, %arg0, %arg1 : vector<1x16xi1> + return %sel : vector<1x16xi1> +} + diff --git a/mlir/test/IR/greedy-pattern-rewriter-driver.mlir b/mlir/test/IR/greedy-pattern-rewrite-driver-bottom-up.mlir similarity index 100% rename from mlir/test/IR/greedy-pattern-rewriter-driver.mlir rename to mlir/test/IR/greedy-pattern-rewrite-driver-bottom-up.mlir diff --git a/mlir/test/IR/greedy-pattern-rewrite-driver-top-down.mlir b/mlir/test/IR/greedy-pattern-rewrite-driver-top-down.mlir new file mode 100644 index 0000000000000..a362d6f99b947 --- /dev/null +++ b/mlir/test/IR/greedy-pattern-rewrite-driver-top-down.mlir @@ -0,0 +1,58 @@ +// RUN: mlir-opt %s -test-patterns="max-iterations=1 top-down=true" \ +// RUN: --split-input-file | FileCheck %s + +// Tests for https://github.com/llvm/llvm-project/issues/86765. Ensure +// that operands of a dead op are added to the worklist even if the same value +// appears multiple times as an operand. 
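The failure mode these cases exercise is easiest to see next to the old predicate from the driver change above, `!operand.use_empty() && !operand.hasOneUse()`, which already rejects a value whose only two uses both belong to the single dead op. A minimal stand-alone sketch of the new distinct-user count (plain C++, not the MLIR API; the type and function names are invented for illustration):

    #include <vector>

    using Operation = struct Op; // opaque stand-in for an operation

    // Mirrors the loop added to addOperandsToWorklist: revisit the
    // operand's producer unless more than two distinct users remain once
    // duplicate uses from the erased op are collapsed into one.
    bool shouldRevisitProducer(const std::vector<Operation *> &users,
                               Operation *erasedOp) {
      Operation *otherUser = nullptr;
      for (Operation *user : users) {
        if (user == erasedOp || user == otherUser)
          continue;
        if (!otherUser) {
          otherUser = user;
          continue;
        }
        return false; // more than two distinct users
      }
      return true;
    }

With this counting, `{dead, dead}` (two uses, one user) still triggers a revisit of the producer, which the old use-count check missed.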
+ +// 2 uses of the same operand + +// CHECK: func.func @f(%arg0: i1) { +// CHECK-NEXT: return +// CHECK-NEXT: } +func.func @f(%arg0: i1) { + %0 = arith.constant 0 : i32 + %if = scf.if %arg0 -> (i32) { + scf.yield %0 : i32 + } else { + scf.yield %0 : i32 + } + %dead_leaf = arith.addi %if, %if : i32 + return +} + +// ----- + +// 3 uses of the same operand + +// CHECK: func.func @f() { +// CHECK-NEXT: return +// CHECK-NEXT: } +func.func @f() { + %0 = arith.constant 0 : i1 + %if = scf.if %0 -> (i1) { + scf.yield %0 : i1 + } else { + scf.yield %0 : i1 + } + %dead_leaf = arith.select %if, %if, %if : i1 + return +} + +// ----- + +// 2 uses of the same operand, op has 3 operands + +// CHECK: func.func @f() { +// CHECK-NEXT: return +// CHECK-NEXT: } +func.func @f() { + %0 = arith.constant 0 : i1 + %if = scf.if %0 -> (i1) { + scf.yield %0 : i1 + } else { + scf.yield %0 : i1 + } + %dead_leaf = arith.select %0, %if, %if : i1 + return +} diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_print.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_print.mlir index 98d76ba350cbd..7758ca77dce9e 100755 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_print.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_print.mlir @@ -120,6 +120,14 @@ ) }> +#COOAoS = #sparse_tensor.encoding<{ + map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton) +}> + +#COOSoA = #sparse_tensor.encoding<{ + map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton(soa)) +}> + module { // @@ -161,6 +169,8 @@ module { %h = sparse_tensor.convert %x : tensor<4x8xi32> to tensor<4x8xi32, #BSCC> %i = sparse_tensor.convert %x : tensor<4x8xi32> to tensor<4x8xi32, #BSR0> %j = sparse_tensor.convert %x : tensor<4x8xi32> to tensor<4x8xi32, #BSC0> + %AoS = sparse_tensor.convert %x : tensor<4x8xi32> to tensor<4x8xi32, #COOAoS> + %SoA = sparse_tensor.convert %x : tensor<4x8xi32> to tensor<4x8xi32, #COOSoA> // CHECK-NEXT: ---- Sparse Tensor ---- // CHECK-NEXT: nse = 5 @@ -274,19 +284,42 @@ module { // CHECK-NEXT: ---- sparse_tensor.print %j : tensor<4x8xi32, #BSC0> + // CHECK-NEXT: ---- Sparse Tensor ---- + // CHECK-NEXT: nse = 5 + // CHECK-NEXT: dim = ( 4, 8 ) + // CHECK-NEXT: lvl = ( 4, 8 ) + // CHECK-NEXT: pos[0] : ( 0, 5, + // CHECK-NEXT: crd[0] : ( 0, 0, 0, 2, 3, 2, 3, 3, 3, 5, + // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, + // CHECK-NEXT: ---- + sparse_tensor.print %AoS : tensor<4x8xi32, #COOAoS> + + // CHECK-NEXT: ---- Sparse Tensor ---- + // CHECK-NEXT: nse = 5 + // CHECK-NEXT: dim = ( 4, 8 ) + // CHECK-NEXT: lvl = ( 4, 8 ) + // CHECK-NEXT: pos[0] : ( 0, 5, + // CHECK-NEXT: crd[0] : ( 0, 0, 3, 3, 3, + // CHECK-NEXT: crd[1] : ( 0, 2, 2, 3, 5, + // CHECK-NEXT: values : ( 1, 2, 3, 4, 5, + // CHECK-NEXT: ---- + sparse_tensor.print %SoA : tensor<4x8xi32, #COOSoA> + // Release the resources. 
- bufferization.dealloc_tensor %XO : tensor<4x8xi32, #AllDense> - bufferization.dealloc_tensor %XT : tensor<4x8xi32, #AllDenseT> - bufferization.dealloc_tensor %a : tensor<4x8xi32, #CSR> - bufferization.dealloc_tensor %b : tensor<4x8xi32, #DCSR> - bufferization.dealloc_tensor %c : tensor<4x8xi32, #CSC> - bufferization.dealloc_tensor %d : tensor<4x8xi32, #DCSC> - bufferization.dealloc_tensor %e : tensor<4x8xi32, #BSR> - bufferization.dealloc_tensor %f : tensor<4x8xi32, #BSRC> - bufferization.dealloc_tensor %g : tensor<4x8xi32, #BSC> - bufferization.dealloc_tensor %h : tensor<4x8xi32, #BSCC> - bufferization.dealloc_tensor %i : tensor<4x8xi32, #BSR0> - bufferization.dealloc_tensor %j : tensor<4x8xi32, #BSC0> + bufferization.dealloc_tensor %XO : tensor<4x8xi32, #AllDense> + bufferization.dealloc_tensor %XT : tensor<4x8xi32, #AllDenseT> + bufferization.dealloc_tensor %a : tensor<4x8xi32, #CSR> + bufferization.dealloc_tensor %b : tensor<4x8xi32, #DCSR> + bufferization.dealloc_tensor %c : tensor<4x8xi32, #CSC> + bufferization.dealloc_tensor %d : tensor<4x8xi32, #DCSC> + bufferization.dealloc_tensor %e : tensor<4x8xi32, #BSR> + bufferization.dealloc_tensor %f : tensor<4x8xi32, #BSRC> + bufferization.dealloc_tensor %g : tensor<4x8xi32, #BSC> + bufferization.dealloc_tensor %h : tensor<4x8xi32, #BSCC> + bufferization.dealloc_tensor %i : tensor<4x8xi32, #BSR0> + bufferization.dealloc_tensor %j : tensor<4x8xi32, #BSC0> + bufferization.dealloc_tensor %AoS : tensor<4x8xi32, #COOAoS> + bufferization.dealloc_tensor %SoA : tensor<4x8xi32, #COOSoA> return } diff --git a/openmp/libomptarget/DeviceRTL/src/Debug.cpp b/openmp/libomptarget/DeviceRTL/src/Debug.cpp index aecc33c0497a7..31cd54e3de35c 100644 --- a/openmp/libomptarget/DeviceRTL/src/Debug.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Debug.cpp @@ -33,10 +33,10 @@ void __assert_fail(const char *expr, const char *file, unsigned line, void __assert_fail_internal(const char *expr, const char *msg, const char *file, unsigned line, const char *function) { if (msg) { - PRINTF("%s:%u: %s: Assertion %s (`%s') failed.\n", file, line, function, + PRINTF("%s:%u: %s: Assertion %s (`%s`) failed.\n", file, line, function, msg, expr); } else { - PRINTF("%s:%u: %s: Assertion `%s' failed.\n", file, line, function, expr); + PRINTF("%s:%u: %s: Assertion `%s` failed.\n", file, line, function, expr); } __builtin_trap(); } diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index c66a58b9406d4..469298708adde 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -3187,6 +3187,27 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { AsyncInfoWrapperTy &AsyncInfoWrapper) override { AMDGPUDeviceTy &DstDevice = static_cast<AMDGPUDeviceTy &>(DstGenericDevice); + // For large transfers use synchronous behavior. 
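    // (Illustrative note) The OMPX_MaxAsyncCopyBytes cutoff appears to be an
    // environment-backed plugin option: the d2d_memcpy_sync.c test added
    // later in this patch runs with
    // LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES=0, which forces every
    // device-to-device copy through this synchronous signal-and-wait path
    // rather than enqueuing it on a stream.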
+ if (Size >= OMPX_MaxAsyncCopyBytes) { + if (AsyncInfoWrapper.hasQueue()) + if (auto Err = synchronize(AsyncInfoWrapper)) + return Err; + + AMDGPUSignalTy Signal; + if (auto Err = Signal.init()) + return Err; + + if (auto Err = utils::asyncMemCopy( + useMultipleSdmaEngines(), DstPtr, DstDevice.getAgent(), SrcPtr, + getAgent(), (uint64_t)Size, 0, nullptr, Signal.get())) + return Err; + + if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) + return Err; + + return Signal.deinit(); + } + AMDGPUStreamTy *Stream = nullptr; if (auto Err = getStream(AsyncInfoWrapper, Stream)) return Err; diff --git a/openmp/libomptarget/plugins-nextgen/common/include/RPC.h b/openmp/libomptarget/plugins-nextgen/common/include/RPC.h index 2e39b3f299c88..b621cc0da4587 100644 --- a/openmp/libomptarget/plugins-nextgen/common/include/RPC.h +++ b/openmp/libomptarget/plugins-nextgen/common/include/RPC.h @@ -16,6 +16,7 @@ #ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RPC_H #define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RPC_H +#include "llvm/ADT/DenseMap.h" #include "llvm/Support/Error.h" #include <cstdint> @@ -32,8 +33,6 @@ class DeviceImageTy; /// these routines will perform no action. struct RPCServerTy { public: - RPCServerTy(uint32_t NumDevices); - /// Check if this device image is using an RPC server. This checks for the /// presence of an externally visible symbol in the device image that will /// be present whenever RPC code is called. @@ -56,7 +55,9 @@ struct RPCServerTy { /// memory associated with the k llvm::Error deinitDevice(plugin::GenericDeviceTy &Device); - ~RPCServerTy(); +private: + /// Array indexed by device identifier, holding each device's RPC handle. + llvm::SmallVector Handles; }; } // namespace llvm::omp::target diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp index 688ce4462a518..a4c3bfca0619b 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp @@ -1604,7 +1604,7 @@ Error GenericPluginTy::init() { GlobalHandler = createGlobalHandler(); assert(GlobalHandler && "Invalid global handler"); - RPCServer = new RPCServerTy(NumDevices); + RPCServer = new RPCServerTy(); assert(RPCServer && "Invalid RPC server"); return Plugin::success(); diff --git a/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp b/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp index f46b27701b5b9..fab0f6838f4a8 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp @@ -21,14 +21,6 @@ using namespace llvm; using namespace omp; using namespace target; -RPCServerTy::RPCServerTy(uint32_t NumDevices) { -#ifdef LIBOMPTARGET_RPC_SUPPORT - // If this fails then something is catastrophically wrong, just exit. 
- if (rpc_status_t Err = rpc_init(NumDevices)) - FATAL_MESSAGE(1, "Error initializing the RPC server: %d\n", Err); -#endif -} - llvm::Expected<bool> RPCServerTy::isDeviceUsingRPC(plugin::GenericDeviceTy &Device, plugin::GenericGlobalHandlerTy &Handler, @@ -44,7 +36,6 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device, plugin::GenericGlobalHandlerTy &Handler, plugin::DeviceImageTy &Image) { #ifdef LIBOMPTARGET_RPC_SUPPORT - uint32_t DeviceId = Device.getDeviceId(); auto Alloc = [](uint64_t Size, void *Data) { plugin::GenericDeviceTy &Device = *reinterpret_cast<plugin::GenericDeviceTy *>(Data); @@ -52,10 +43,12 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device, }; uint64_t NumPorts = std::min(Device.requestedRPCPortCount(), RPC_MAXIMUM_PORT_COUNT); - if (rpc_status_t Err = rpc_server_init(DeviceId, NumPorts, + rpc_device_t RPCDevice; + if (rpc_status_t Err = rpc_server_init(&RPCDevice, NumPorts, Device.getWarpSize(), Alloc, &Device)) return plugin::Plugin::error( - "Failed to initialize RPC server for device %d: %d", DeviceId, Err); + "Failed to initialize RPC server for device %d: %d", + Device.getDeviceId(), Err); // Register a custom opcode handler to perform plugin specific allocation. auto MallocHandler = [](rpc_port_t Port, void *Data) { @@ -70,10 +63,10 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device, Data); }; if (rpc_status_t Err = - rpc_register_callback(DeviceId, RPC_MALLOC, MallocHandler, &Device)) + rpc_register_callback(RPCDevice, RPC_MALLOC, MallocHandler, &Device)) return plugin::Plugin::error( - "Failed to register RPC malloc handler for device %d: %d\n", DeviceId, - Err); + "Failed to register RPC malloc handler for device %d: %d\n", + Device.getDeviceId(), Err); // Register a custom opcode handler to perform plugin specific deallocation. auto FreeHandler = [](rpc_port_t Port, void *Data) { @@ -88,10 +81,10 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device, Data); }; if (rpc_status_t Err = - rpc_register_callback(DeviceId, RPC_FREE, FreeHandler, &Device)) + rpc_register_callback(RPCDevice, RPC_FREE, FreeHandler, &Device)) return plugin::Plugin::error( - "Failed to register RPC free handler for device %d: %d\n", DeviceId, - Err); + "Failed to register RPC free handler for device %d: %d\n", + Device.getDeviceId(), Err); // Get the address of the RPC client from the device. 
void *ClientPtr; @@ -104,17 +97,20 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device, sizeof(void *), nullptr)) return Err; - const void *ClientBuffer = rpc_get_client_buffer(DeviceId); + const void *ClientBuffer = rpc_get_client_buffer(RPCDevice); if (auto Err = Device.dataSubmit(ClientPtr, ClientBuffer, rpc_get_client_size(), nullptr)) return Err; + Handles.resize(Device.getDeviceId() + 1); + Handles[Device.getDeviceId()] = RPCDevice.handle; #endif return Error::success(); } Error RPCServerTy::runServer(plugin::GenericDeviceTy &Device) { #ifdef LIBOMPTARGET_RPC_SUPPORT - if (rpc_status_t Err = rpc_handle_server(Device.getDeviceId())) + rpc_device_t RPCDevice{Handles[Device.getDeviceId()]}; + if (rpc_status_t Err = rpc_handle_server(RPCDevice)) return plugin::Plugin::error( "Error while running RPC server on device %d: %d", Device.getDeviceId(), Err); @@ -124,22 +120,16 @@ Error RPCServerTy::runServer(plugin::GenericDeviceTy &Device) { Error RPCServerTy::deinitDevice(plugin::GenericDeviceTy &Device) { #ifdef LIBOMPTARGET_RPC_SUPPORT + rpc_device_t RPCDevice{Handles[Device.getDeviceId()]}; auto Dealloc = [](void *Ptr, void *Data) { plugin::GenericDeviceTy &Device = *reinterpret_cast<plugin::GenericDeviceTy *>(Data); Device.free(Ptr, TARGET_ALLOC_HOST); }; - if (rpc_status_t Err = - rpc_server_shutdown(Device.getDeviceId(), Dealloc, &Device)) + if (rpc_status_t Err = rpc_server_shutdown(RPCDevice, Dealloc, &Device)) return plugin::Plugin::error( "Failed to shut down RPC server for device %d: %d", Device.getDeviceId(), Err); #endif return Error::success(); } - -RPCServerTy::~RPCServerTy() { -#ifdef LIBOMPTARGET_RPC_SUPPORT - rpc_shutdown(); -#endif -} diff --git a/openmp/libomptarget/test/offloading/d2d_memcpy_sync.c b/openmp/libomptarget/test/offloading/d2d_memcpy_sync.c new file mode 100644 index 0000000000000..6b9b765a74d82 --- /dev/null +++ b/openmp/libomptarget/test/offloading/d2d_memcpy_sync.c @@ -0,0 +1,72 @@ +// RUN: %libomptarget-compile-generic && \ +// RUN: env LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES=0 %libomptarget-run-generic | \ +// RUN: %fcheck-generic -allow-empty +// REQUIRES: amdgcn-amd-amdhsa + +#include <assert.h> +#include <omp.h> +#include <stdio.h> +#include <stdlib.h> + +const int magic_num = 7; + +int main(int argc, char *argv[]) { + const int N = 128; + const int num_devices = omp_get_num_devices(); + + // No target device, just return + if (num_devices == 0) { + printf("PASS\n"); + return 0; + } + + const int src_device = 0; + int dst_device = num_devices - 1; + + int length = N * sizeof(int); + int *src_ptr = omp_target_alloc(length, src_device); + int *dst_ptr = omp_target_alloc(length, dst_device); + + if (!src_ptr || !dst_ptr) { + printf("FAIL\n"); + return 1; + } + +#pragma omp target teams distribute parallel for device(src_device) \ + is_device_ptr(src_ptr) + for (int i = 0; i < N; ++i) { + src_ptr[i] = magic_num; + } + + if (omp_target_memcpy(dst_ptr, src_ptr, length, 0, 0, dst_device, + src_device)) { + printf("FAIL\n"); + return 1; + } + + int *buffer = malloc(length); + if (!buffer) { + printf("FAIL\n"); + return 1; + } + +#pragma omp target teams distribute parallel for device(dst_device) \ + map(from : buffer[0 : N]) is_device_ptr(dst_ptr) + for (int i = 0; i < N; ++i) { + buffer[i] = dst_ptr[i] + magic_num; + } + + for (int i = 0; i < N; ++i) + assert(buffer[i] == 2 * magic_num); + + printf("PASS\n"); + + // Free host and device memory + free(buffer); + omp_target_free(src_ptr, src_device); + omp_target_free(dst_ptr, dst_device); + + return 0; +} + +// CHECK: PASS diff --git 
a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp index 6b3a63f113772..d29d2690c771c 100644 --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp @@ -4431,8 +4431,10 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, #endif KMP_MB(); - /* first, try to get one from the thread pool */ - if (__kmp_thread_pool) { + /* first, try to get one from the thread pool unless allocating thread is + * the main hidden helper thread. The hidden helper team should always + * allocate new OS threads. */ + if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) { new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; if (new_thr == __kmp_thread_pool_insert_pt) { @@ -4497,7 +4499,7 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, } /* no, well fork a new one */ - KMP_ASSERT(__kmp_nth == __kmp_all_nth); + KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth); KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); #if KMP_USE_MONITOR diff --git a/openmp/runtime/test/tasking/hidden_helper_task/issue-87117.c b/openmp/runtime/test/tasking/hidden_helper_task/issue-87117.c new file mode 100644 index 0000000000000..23080982f49e1 --- /dev/null +++ b/openmp/runtime/test/tasking/hidden_helper_task/issue-87117.c @@ -0,0 +1,36 @@ +// RUN: %libomp-compile +// RUN: env KMP_HOT_TEAMS_MODE=0 KMP_HOT_TEAMS_MAX_LEVEL=1 %libomp-run +// +// Force the defaults of: +// KMP_HOT_TEAMS_MODE=0 means free extra threads after parallel +// involving non-hot team +// KMP_HOT_TEAMS_MAX_LEVEL=1 means only the initial outer team +// is a hot team. + +#include +#include +#include + +int main() { + int a; + omp_set_max_active_levels(2); +// This nested parallel creates extra threads on the thread pool +#pragma omp parallel num_threads(2) + { +#pragma omp parallel num_threads(2) + { +#pragma omp atomic + a++; + } + } + +// Causes assert if hidden helper thread tries to allocate from thread pool +// instead of creating new OS threads +#pragma omp parallel num_threads(1) + { +#pragma omp target nowait + { a++; } + } + + return EXIT_SUCCESS; +} diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index c01986815afe1..1bf6bee10952b 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -2056,6 +2056,7 @@ cc_library( "//llvm:Demangle", "//llvm:Support", "//llvm:TextAPI", + "//llvm:TextAPIBinaryReader", ], ) diff --git a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl index 7d815bc4a2299..7dc12bade2605 100644 --- a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl +++ b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl @@ -78,7 +78,6 @@ def libc_function( its deps. **kwargs: Other attributes relevant for a cc_library. For example, deps. """ - # We use the explicit equals pattern here because append and += mutate the # original list, where this creates a new list and stores it in deps. copts = copts or [] @@ -87,7 +86,15 @@ def libc_function( "-fno-builtin", "-fno-lax-vector-conversions", "-ftrivial-auto-var-init=pattern", + "-fno-omit-frame-pointer", + "-fstack-protector-strong", ] + # x86 targets have -mno-omit-leaf-frame-pointer. 
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index c01986815afe1..1bf6bee10952b 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -2056,6 +2056,7 @@ cc_library(
         "//llvm:Demangle",
         "//llvm:Support",
         "//llvm:TextAPI",
+        "//llvm:TextAPIBinaryReader",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
index 7d815bc4a2299..7dc12bade2605 100644
--- a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
+++ b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
@@ -78,7 +78,6 @@ def libc_function(
           its deps.
       **kwargs: Other attributes relevant for a cc_library. For example, deps.
     """
-
     # We use the explicit equals pattern here because append and += mutate the
     # original list, where this creates a new list and stores it in deps.
     copts = copts or []
@@ -87,7 +86,15 @@ def libc_function(
         "-fno-builtin",
         "-fno-lax-vector-conversions",
         "-ftrivial-auto-var-init=pattern",
+        "-fno-omit-frame-pointer",
+        "-fstack-protector-strong",
     ]
+
+    # x86 targets have -mno-omit-leaf-frame-pointer.
+    platform_copts = selects.with_or({
+        PLATFORM_CPU_X86_64: ["-mno-omit-leaf-frame-pointer"],
+        "//conditions:default": []
+    })
+    copts = copts + platform_copts
 
     # We compile the code twice, the first target is suffixed with ".__internal__" and contains the
     # C++ functions in the "LIBC_NAMESPACE" namespace. This allows us to test the function in the
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index 15e477351fe3c..9cfcb7d3838ed 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -7,6 +7,7 @@ load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
 load("//mlir:tblgen.bzl", "td_library")
 load(":binary_alias.bzl", "binary_alias")
 load(":config.bzl", "llvm_config_defines")
+load(":driver.bzl", "generate_driver_selects", "generate_driver_tools_def", "llvm_driver_cc_binary", "select_driver_tools")
 load(":enum_targets_gen.bzl", "enum_targets_gen")
 load(":targets.bzl", "llvm_targets")
 load(":tblgen.bzl", "gentbl")
@@ -605,6 +606,32 @@ cc_library(
     ],
 )
 
+# Command line flag to control which tools get included in the llvm driver binary.
+# The macro also generates config_setting targets used by select_driver_tools().
+generate_driver_selects(name = "driver-tools")
+
+generate_driver_tools_def(
+    name = "gen_llvm_driver_tools_def",
+    out = "LLVMDriverTools.def",
+    driver_tools = select_driver_tools(":driver-tools"),
+)
+
+# Workaround inability to put `.def` files into `srcs` with a library
+cc_library(
+    name = "llvm_driver_tools_def_lib",
+    includes = ["."],
+    textual_hdrs = ["LLVMDriverTools.def"],
+)
+
+cc_binary(
+    name = "llvm",
+    srcs = glob(["tools/llvm-driver/*.cpp"]),
+    deps = [
+        ":Support",
+        ":llvm_driver_tools_def_lib",
+    ] + select_driver_tools(":driver-tools"),
+)
+
 cc_binary(
     name = "llvm-min-tblgen",
     srcs = [
@@ -1011,6 +1038,7 @@ cc_library(
     hdrs = ["include/llvm/TextAPI/DylibReader.h"],
     copts = llvm_copts,
     deps = [
+        ":DebugInfoDWARF",
         ":Object",
         ":Support",
         ":TargetParser",
@@ -3304,22 +3332,10 @@ cc_binary(
     ],
 )
 
-expand_template(
-    name = "ar_main",
-    out = "llvm-ar-driver.cpp",
-    substitutions = {
-        "@TOOL_NAME@": "llvm_ar",
-    },
-    template = "cmake/modules/llvm-driver-template.cpp.in",
-)
-
-cc_binary(
-    name = "llvm-ar",
-    srcs = glob([
-        "tools/llvm-ar/*.cpp",
-    ]) + ["llvm-ar-driver.cpp"],
+cc_library(
+    name = "llvm-ar-lib",
+    srcs = glob(["tools/llvm-ar/*.cpp"]),
     copts = llvm_copts,
-    stamp = 0,
     deps = [
         ":AllTargetsAsmParsers",
         ":AllTargetsCodeGens",
@@ -3333,6 +3349,12 @@ cc_binary(
     ],
 )
 
+llvm_driver_cc_binary(
+    name = "llvm-ar",
+    stamp = 0,
+    deps = [":llvm-ar-lib"],
+)
+
 # We need to run llvm-ar with different basenames to make it run with
 # different behavior.
 binary_alias(
@@ -4150,22 +4172,10 @@ gentbl(
     td_srcs = ["include/llvm/Option/OptParser.td"],
 )
 
-expand_template(
-    name = "nm_main",
-    out = "llvm-nm-driver.cpp",
-    substitutions = {
-        "@TOOL_NAME@": "llvm_nm",
-    },
-    template = "cmake/modules/llvm-driver-template.cpp.in",
-)
-
-cc_binary(
-    name = "llvm-nm",
-    srcs = glob([
-        "tools/llvm-nm/*.cpp",
-    ]) + ["llvm-nm-driver.cpp"],
+cc_library(
+    name = "llvm-nm-lib",
+    srcs = glob(["tools/llvm-nm/*.cpp"]),
     copts = llvm_copts,
-    stamp = 0,
     deps = [
         ":AllTargetsAsmParsers",
         ":AllTargetsCodeGens",
@@ -4182,6 +4192,12 @@ cc_binary(
     ],
 )
 
+llvm_driver_cc_binary(
+    name = "llvm-nm",
+    stamp = 0,
+    deps = [":llvm-nm-lib"],
+)
+
 gentbl(
     name = "llvm-objcopy-opts",
     strip_include_prefix = "tools/llvm-objcopy",
@@ -4633,22 +4649,10 @@ gentbl(
     td_srcs = ["include/llvm/Option/OptParser.td"],
 )
 
-expand_template(
-    name = "size_main",
-    out = "llvm-size-driver.cpp",
-    substitutions = {
-        "@TOOL_NAME@": "llvm_size",
-    },
-    template = "cmake/modules/llvm-driver-template.cpp.in",
-)
-
-cc_binary(
-    name = "llvm-size",
-    srcs = glob([
-        "tools/llvm-size/*.cpp",
-    ]) + ["llvm-size-driver.cpp"],
+cc_library(
+    name = "llvm-size-lib",
+    srcs = glob(["tools/llvm-size/*.cpp"]),
     copts = llvm_copts,
-    stamp = 0,
     deps = [
         ":Object",
         ":Option",
@@ -4657,6 +4661,12 @@ cc_binary(
     ],
 )
 
+llvm_driver_cc_binary(
+    name = "llvm-size",
+    stamp = 0,
+    deps = [":llvm-size-lib"],
+)
+
 cc_binary(
     name = "llvm-split",
     srcs = glob([
diff --git a/utils/bazel/llvm-project-overlay/llvm/driver.bzl b/utils/bazel/llvm-project-overlay/llvm/driver.bzl
new file mode 100644
index 0000000000000..bd0d26d64f481
--- /dev/null
+++ b/utils/bazel/llvm-project-overlay/llvm/driver.bzl
@@ -0,0 +1,182 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""Configuration for the llvm-driver tool."""
+
+load("@bazel_skylib//rules:common_settings.bzl", "BuildSettingInfo")
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+
+# Mapping from every tool to the cc_library that implements the tool's entrypoint.
+# TODO: uncomment the remaining targets after splitting them
+# into separate library/binary targets.
+_TOOLS = {
+    # "clang-scan-deps": "//clang:clang-scan-deps-lib",
+    # "clang": "//clang:clang-driver",
+    # "dsymutil": "//llvm:dsymutil-lib",
+    # "lld": "//lld:lld-lib",
+    "llvm-ar": "//llvm:llvm-ar-lib",
+    # "llvm-cxxfilt": "//llvm:llvm-cxxfilt-lib",
+    # "llvm-dwp": "//llvm:llvm-dwp-lib",
+    # "llvm-gsymutil": "//llvm:llvm-gsymutil-lib",
+    # "llvm-ifs": "//llvm:llvm-ifs-lib",
+    # "llvm-libtool-darwin": "//llvm:llvm-libtool-darwin-lib",
+    # "llvm-lipo": "//llvm:llvm-lipo-lib",
+    # "llvm-ml": "//llvm:llvm-ml-lib",
+    # "llvm-mt": "//llvm:llvm-mt-lib",
+    "llvm-nm": "//llvm:llvm-nm-lib",
+    # "llvm-objcopy": "//llvm:llvm-objcopy-lib",
+    # "llvm-objdump": "//llvm:llvm-objdump-lib",
+    # "llvm-profdata": "//llvm:llvm-profdata-lib",
+    # "llvm-rc": "//llvm:llvm-rc-lib",
+    # "llvm-readobj": "//llvm:llvm-readobj-lib",
+    "llvm-size": "//llvm:llvm-size-lib",
+    # "llvm-symbolizer": "//llvm:llvm-symbolizer-lib",
+    # "sancov": "//llvm:sancov-lib",
+}
+
+# Tools automatically get their own name as an alias, but there may be additional
+# aliases for a given tool.
+_EXTRA_ALIASES = {
+    "clang": ["clang++", "clang-cl", "clang-cpp"],
+    "lld": ["lld-link", "ld.lld", "ld64.lld", "wasm-ld"],
+    "llvm-ar": ["ranlib", "lib", "dlltool"],
+    "llvm-objcopy": ["bitcode-strip", "install-name-tool", "strip"],
+    "llvm-objdump": ["otool"],
+    "llvm-rc": ["windres"],
+    "llvm-readobj": ["readelf"],
+    "llvm-symbolizer": ["addr2line"],
+}
+
+def _validated_string_list_flag_impl(ctx):
+    invalid_values = [v for v in ctx.build_setting_value if v not in ctx.attr.values]
+    if invalid_values:
+        fail("Tool(s) [{}] are not in the known list of tools: {}".format(
+            ", ".join(invalid_values),
+            ", ".join(ctx.attr.values),
+        ))
+    return BuildSettingInfo(value = ctx.build_setting_value)
+
+# Like string_list_flag, but with the validation that string_flag provides.
+_validated_string_list_flag = rule(
+    implementation = _validated_string_list_flag_impl,
+    build_setting = config.string_list(flag = True),
+    attrs = {
+        "values": attr.string_list(
+            doc = "The list of allowed values for this setting. An error is raised if any other value is given.",
+        ),
+    },
+    doc = "A string list-typed build setting that can be set on the command line",
+)
+
+def generate_driver_selects(name):
+    """Generates flags and config settings to configure the tool list.
+
+    By default, all supported tools are included in the "llvm" driver binary.
+    To build only a subset, specify just the subset you want as the flag.
+    For example, to produce a binary with just llvm-nm and llvm-size, run:
+
+    $ bazel build \
+        --@llvm-project//llvm:driver-tools=llvm-nm,llvm-size \
+        @llvm-project//llvm:llvm
+
+    Note: this assumes the flag name is "driver-tools" by being invoked as:
+    generate_driver_selects(name = "driver-tools")
+
+    Args:
+      name: the name of the flag that configures which tools are included.
+    """
+
+    _validated_string_list_flag(
+        name = name,
+        build_setting_default = _TOOLS.keys(),
+        values = _TOOLS.keys(),
+    )
+    for tool in _TOOLS.keys():
+        native.config_setting(
+            name = "{}-include-{}".format(name, tool),
+            flag_values = {name: tool},
+        )
+
+def select_driver_tools(flag):
+    """Produce a list of tool deps based on generate_driver_selects().
+
+    Args:
+      flag: name that was used for generate_driver_selects().
+    Returns:
+      List of tool deps based on generate_driver_selects().
+    """
+    tools = []
+    for tool, target in _TOOLS.items():
+        tools += select({
+            "{}-include-{}".format(flag, tool): [target],
+            "//conditions:default": [],
+        })
+    return tools
+
+def _generate_driver_tools_def_impl(ctx):
+    # Depending on how the LLVM build files are included,
+    # it may or may not have the @llvm-project repo prefix.
+    # Compare just on the name. We could also include the package,
+    # but the name itself is unique in practice.
+    label_to_name = {Label(v).name: k for k, v in _TOOLS.items()}
+
+    # Reverse sort by the *main* tool name, but keep aliases together.
+    # This is consistent with how tools/llvm-driver/CMakeLists.txt does it,
+    # and this makes sure that more specific tools are checked first.
+    # For example, "clang-scan-deps" should not match "clang".
+    tools = [label_to_name[tool.label.name] for tool in ctx.attr.driver_tools]
+    tool_alias_pairs = []
+    for tool_name in reversed(tools):
+        tool_alias_pairs.append((tool_name, tool_name))
+        for extra_alias in _EXTRA_ALIASES.get(tool_name, []):
+            tool_alias_pairs.append((tool_name, extra_alias))
+
+    lines = [
+        'LLVM_DRIVER_TOOL("{alias}", {tool})'.format(
+            tool = tool_name.replace("-", "_"),
+            alias = alias.removeprefix("llvm-"),
+        )
+        for (tool_name, alias) in tool_alias_pairs
+    ]
+    lines.append("#undef LLVM_DRIVER_TOOL")
+
+    ctx.actions.write(
+        output = ctx.outputs.out,
+        content = "\n".join(lines),
+    )
+
+generate_driver_tools_def = rule(
+    implementation = _generate_driver_tools_def_impl,
+    doc = """Generate a list of LLVM_DRIVER_TOOL macros.
+See tools/llvm-driver/CMakeLists.txt for the reference implementation.""",
+    attrs = {
+        "driver_tools": attr.label_list(
+            doc = "List of tools to include in the generated header. Use select_driver_tools() to provide this.",
+            providers = [CcInfo],
+        ),
+        "out": attr.output(
+            doc = "Name of the generated .def output file.",
+            mandatory = True,
+        ),
+    },
+)
+
+def llvm_driver_cc_binary(
+        name,
+        deps = None,
+        **kwargs):
+    """cc_binary wrapper for binaries using the llvm-driver template."""
+    expand_template(
+        name = "_gen_" + name,
+        out = name + "-driver.cpp",
+        substitutions = {"@TOOL_NAME@": name.replace("-", "_")},
+        template = "//llvm:cmake/modules/llvm-driver-template.cpp.in",
+    )
+    deps = deps or []
+    native.cc_binary(
+        name = name,
+        srcs = [name + "-driver.cpp"],
+        deps = deps + ["//llvm:Support"],
+        **kwargs
+    )
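With the three tools enabled in _TOOLS above, the generated LLVMDriverTools.def would look roughly like this (a sketch derived from _generate_driver_tools_def_impl: main names come first in reverse order, each tool's aliases follow it, and the "llvm-" prefix is stripped from aliases):

LLVM_DRIVER_TOOL("size", llvm_size)
LLVM_DRIVER_TOOL("nm", llvm_nm)
LLVM_DRIVER_TOOL("ar", llvm_ar)
LLVM_DRIVER_TOOL("ranlib", llvm_ar)
LLVM_DRIVER_TOOL("lib", llvm_ar)
LLVM_DRIVER_TOOL("dlltool", llvm_ar)
#undef LLVM_DRIVER_TOOL

A consumer treats that file as an X-macro list. The C fragment below is a hedged illustration of the pattern, not the actual tools/llvm-driver source; dispatch and the entry-point signature are assumptions for illustration only.

#include <string.h>

/* Declare an entry point for each tool named in the .def file. */
#define LLVM_DRIVER_TOOL(name, entry) int entry##_main(int argc, char **argv);
#include "LLVMDriverTools.def"

/* Route an invocation (e.g. the basename of argv[0]) to the matching tool. */
static int dispatch(const char *alias, int argc, char **argv) {
#define LLVM_DRIVER_TOOL(name, entry)                                          \
  if (strcmp(alias, name) == 0)                                                \
    return entry##_main(argc, argv);
#include "LLVMDriverTools.def"
  return 127; /* no tool matched */
}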
":IR", - ":MemRefDialect", - ":OpenACCDialect", ":Pass", - ":SCFDialect", - ":Support", ":TransformUtils", - ":Transforms", ":VectorDialect", ], ) @@ -2187,13 +2173,13 @@ cc_library( ":ArmSMEIntrinsicOpsIncGen", ":ArmSMEOpInterfacesIncGen", ":ArmSMEOpsIncGen", + ":BytecodeOpInterface", ":IR", ":LLVMDialect", ":MemRefDialect", ":SCFDialect", ":SideEffectInterfaces", ":VectorDialect", - "//llvm:Core", "//llvm:Support", ], ) @@ -2204,7 +2190,6 @@ cc_library( hdrs = glob(["include/mlir/Dialect/ArmSME/Transforms/*.h"]), includes = ["include"], deps = [ - ":ArithDialect", ":ArithUtils", ":ArmSMEDialect", ":ArmSMETransformsPassIncGen", @@ -2217,11 +2202,8 @@ cc_library( ":LLVMDialect", ":MemRefDialect", ":Pass", - ":SCFDialect", ":SCFTransforms", ":TransformUtils", - ":Transforms", - ":VectorDialect", "//llvm:Support", ], ) @@ -2238,7 +2220,6 @@ cc_library( ":Pass", ":SCFDialect", ":TransformUtils", - ":Transforms", ], ) @@ -2258,7 +2239,6 @@ cc_library( ":MemRefDialect", ":Pass", ":TransformUtils", - ":Transforms", ":VectorDialect", ], ) @@ -2328,11 +2308,11 @@ cc_library( includes = ["include"], deps = [ ":ArmSVEIncGen", + ":BytecodeOpInterface", ":IR", ":LLVMDialect", ":SideEffectInterfaces", ":VectorDialect", - "//llvm:Core", "//llvm:Support", ], ) @@ -2442,11 +2422,10 @@ cc_library( includes = ["include"], deps = [ ":AMXIncGen", + ":BytecodeOpInterface", ":IR", ":LLVMDialect", ":SideEffectInterfaces", - "//llvm:Core", - "//llvm:Support", ], ) @@ -2457,12 +2436,9 @@ cc_library( includes = ["include"], deps = [ ":AMXDialect", - ":FuncDialect", ":IR", ":LLVMCommonConversion", ":LLVMDialect", - "//llvm:Core", - "//llvm:Support", ], ) @@ -2535,13 +2511,12 @@ cc_library( hdrs = ["include/mlir/Dialect/X86Vector/X86VectorDialect.h"], includes = ["include"], deps = [ + ":BytecodeOpInterface", ":IR", ":InferTypeOpInterface", ":LLVMDialect", ":SideEffectInterfaces", ":X86VectorIncGen", - "//llvm:Core", - "//llvm:Support", ], ) @@ -2552,14 +2527,12 @@ cc_library( includes = ["include"], deps = [ ":ArithDialect", - ":FuncDialect", ":IR", ":LLVMCommonConversion", ":LLVMDialect", ":VectorDialect", ":VectorUtils", ":X86VectorDialect", - "//llvm:Core", "//llvm:Support", ], ) @@ -2733,6 +2706,7 @@ cc_library( ], includes = ["include"], deps = [ + ":BytecodeOpInterface", ":Dialect", ":IR", ":IRDLAttributesIncGen", @@ -2742,6 +2716,7 @@ cc_library( ":IRDLOpsIncGen", ":IRDLTypesIncGen", ":InferTypeOpInterface", + ":SideEffectInterfaces", ":Support", "//llvm:Core", "//llvm:Support", @@ -2870,7 +2845,6 @@ cc_library( ":TensorTransforms", ":TilingInterface", ":TransformUtils", - ":Transforms", "//llvm:Support", ], ) @@ -2916,6 +2890,7 @@ cc_library( ":AffineUtils", ":ArithDialect", ":ArithUtils", + ":BytecodeOpInterface", ":DialectUtils", ":FuncDialect", ":IR", @@ -2924,11 +2899,9 @@ cc_library( ":SCFTransformOpsIncGen", ":SCFTransforms", ":SCFUtils", - ":SideEffectInterfaces", ":TransformDialect", ":TransformDialectInterfaces", ":VectorDialect", - "//llvm:Support", ], ) @@ -3154,14 +3127,11 @@ cc_library( includes = ["include"], deps = [ ":IR", - ":LinalgDialect", ":LinalgTransformOps", ":SparseTensorDialect", ":SparseTensorTransformOpsIncGen", - ":Support", ":TransformDialect", ":TransformDialectInterfaces", - "//llvm:Support", ], ) @@ -3249,7 +3219,6 @@ cc_library( ":BufferizationTransforms", ":ConversionPasses", ":FuncDialect", - ":FuncTransforms", ":GPUDialect", ":GPUToNVVMTransforms", ":GPUTransforms", @@ -3259,10 +3228,8 @@ cc_library( ":Pass", ":SparseTensorDialect", ":SparseTensorTransforms", - 
":TensorTransforms", ":Transforms", ":VectorToLLVM", - ":VectorTransforms", ], ) @@ -3397,10 +3364,12 @@ cc_library( includes = ["include"], deps = [ ":ArithDialect", + ":BytecodeOpInterface", ":DialectUtils", ":IR", ":InferTypeOpInterface", ":MeshIncGen", + ":SideEffectInterfaces", ":Support", ":ViewLikeInterface", "//llvm:Support", @@ -3448,7 +3417,6 @@ cc_library( ":Pass", ":Support", ":TensorDialect", - ":TransformUtils", ":Transforms", "//llvm:Support", ], @@ -3548,12 +3516,12 @@ cc_library( hdrs = ["include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h"], includes = ["include"], deps = [ + ":BytecodeOpInterface", ":GPUDialect", ":IR", ":LLVMDialect", ":NVGPUIncGen", ":SideEffectInterfaces", - "//llvm:Core", "//llvm:Support", ], ) @@ -3573,8 +3541,6 @@ cc_library( ":ArithDialect", ":ArithUtils", ":DialectUtils", - ":GPUCommonTransforms", - ":GPUCompilationAttrInterfacesIncGen", ":GPUDialect", ":GPUToGPURuntimeTransforms", ":IR", @@ -3588,7 +3554,6 @@ cc_library( ":NVVMDialect", ":SCFDialect", ":SCFTransforms", - ":Support", ":TransformDialect", ":TransformDialectInterfaces", ":VectorDialect", @@ -3651,9 +3616,7 @@ cc_library( ]), includes = ["include"], deps = [ - ":AffineDialect", ":ArithDialect", - ":FuncDialect", ":GPUDialect", ":IR", ":MemRefDialect", @@ -3662,9 +3625,7 @@ cc_library( ":Pass", ":SideEffectInterfaces", ":Support", - ":Transforms", ":VectorDialect", - "//llvm:Core", "//llvm:Support", ], ) @@ -3772,7 +3733,6 @@ cc_library( ":SideEffectInterfaces", ":ViewLikeInterface", ":XeGPUIncGen", - "//llvm:Core", "//llvm:Support", ], ) @@ -3881,7 +3841,6 @@ cc_library( ]), includes = ["include"], deps = [ - ":ArithUtils", ":DialectUtilsIncGen", ":IR", ":Support", @@ -3902,15 +3861,12 @@ cc_library( ":AffineMemoryOpInterfacesIncGen", ":AffineOpsIncGen", ":ArithDialect", - ":BufferizationInterfaces", ":ControlFlowInterfaces", - ":DialectUtils", ":IR", ":InliningUtils", ":LoopLikeInterface", ":MemRefDialect", ":ShapedOpInterfaces", - ":SideEffectInterfaces", ":Support", ":UBDialect", ":ValueBoundsOpInterface", @@ -3928,7 +3884,7 @@ cc_library( ]), includes = ["include"], deps = [ - ":CallOpInterfaces", + ":BytecodeOpInterface", ":CastInterfaces", ":ControlFlowInterfaces", ":EmitCAttributesIncGen", @@ -3970,6 +3926,8 @@ cc_library( includes = ["include"], deps = [ ":AsyncOpsIncGen", + ":BytecodeOpInterface", + ":CallOpInterfaces", ":ControlFlowInterfaces", ":FunctionInterfaces", ":IR", @@ -4003,9 +3961,6 @@ cc_library( ":SCFToControlFlow", ":Support", ":TransformUtils", - ":Transforms", - ":TransformsPassIncGen", - "//llvm:Core", "//llvm:Support", ], ) @@ -4023,7 +3978,6 @@ cc_library( ":ArithDialect", ":CallOpInterfaces", ":DialectUtils", - ":FuncDialect", ":IR", ":SideEffectInterfaces", ":Support", @@ -4238,9 +4192,7 @@ cc_library( ":LLVMCommonConversion", ":LLVMDialect", ":Pass", - ":Support", ":TransformUtils", - ":Transforms", "//llvm:Support", ], ) @@ -4256,12 +4208,9 @@ cc_library( ":AffineDialect", ":AffineTransforms", ":AffineUtils", - ":ArithDialect", ":ConversionPassIncGen", - ":FuncDialect", ":IR", ":MemRefDialect", - ":Pass", ":SCFDialect", ":Support", ":TransformUtils", @@ -4290,7 +4239,6 @@ cc_library( ":ControlFlowDialect", ":ControlFlowInterfaces", ":DestinationStyleOpInterface", - ":FuncDialect", ":FunctionInterfaces", ":IR", ":InferTypeOpInterface", @@ -4298,10 +4246,9 @@ cc_library( ":LoopLikeInterface", ":MemRefDialect", ":ParallelCombiningOpInterface", - ":Pass", ":SCFDeviceMappingInterfacesIncGen", ":SCFIncGen", - ":SCFPassIncGen", + ":SideEffectInterfaces", 
":Support", ":TensorDialect", ":ValueBoundsOpInterface", @@ -4335,7 +4282,6 @@ cc_library( ":SideEffectInterfaces", ":Support", ":TransformUtils", - ":Transforms", "//llvm:Support", ], ) @@ -4348,7 +4294,6 @@ cc_library( hdrs = ["include/mlir/Interfaces/Utils/InferIntRangeCommon.h"], includes = ["include"], deps = [ - ":IR", ":InferIntRangeInterface", "//llvm:Support", ], @@ -4399,7 +4344,6 @@ cc_library( deps = [ ":IR", ":MemorySlotInterfacesIncGen", - "//llvm:Support", ], ) @@ -4411,7 +4355,6 @@ cc_library( deps = [ ":IR", ":ShapedOpInterfacesIncGen", - "//llvm:Support", ], ) @@ -4423,7 +4366,6 @@ cc_library( deps = [ ":IR", ":ParallelCombiningOpInterfaceIncGen", - "//llvm:Support", ], ) @@ -4435,7 +4377,6 @@ cc_library( deps = [ ":IR", ":RuntimeVerifiableOpInterfaceIncGen", - "//llvm:Support", ], ) @@ -4549,11 +4490,12 @@ cc_library( deps = [ ":ArithDialect", ":BufferizationInterfaces", + ":BytecodeOpInterface", + ":CallOpInterfaces", ":CastInterfaces", ":CommonFolders", ":ControlFlowInterfaces", ":Dialect", - ":FuncDialect", ":FunctionInterfaces", ":IR", ":InferTypeOpInterface", @@ -4594,15 +4536,12 @@ cc_library( ":ConversionPassIncGen", ":FuncDialect", ":IR", - ":MemRefDialect", ":Pass", ":SCFDialect", ":ShapeDialect", ":ShapeToStandardGen", - ":Support", ":TensorDialect", ":TransformUtils", - ":Transforms", "//llvm:Support", ], ) @@ -4645,7 +4584,6 @@ cc_library( ":ShapeTransformsPassIncGen", ":TensorDialect", ":TransformUtils", - ":Transforms", "//llvm:Support", ], ) @@ -4718,7 +4656,6 @@ cc_library( ":ArithDialect", ":BufferizationInterfaces", ":BytecodeOpInterface", - ":CommonFolders", ":ControlFlowInterfaces", ":ControlFlowOpsIncGen", ":ConvertToLLVMInterface", @@ -4761,12 +4698,9 @@ cc_library( ]), includes = ["include"], deps = [ - ":ArithDialect", ":BufferizationInterfaces", + ":BytecodeOpInterface", ":CallOpInterfaces", - ":CastInterfaces", - ":CommonFolders", - ":ControlFlowDialect", ":ControlFlowInterfaces", ":ConvertToLLVMInterface", ":FuncIncGen", @@ -4789,7 +4723,6 @@ cc_library( ":ControlFlowDialect", ":FuncDialect", ":IR", - ":InferTypeOpInterface", ":InliningUtils", ":MeshShardingInterface", ], @@ -4836,6 +4769,7 @@ cc_library( ], includes = ["include"], deps = [ + ":BytecodeOpInterface", ":FuncDialect", ":FuncToLLVM", ":FuncTransformOpsIncGen", @@ -4919,10 +4853,8 @@ cc_library( ":IR", ":MemRefDialect", ":Pass", - ":SCFDialect", ":Support", ":TransformUtils", - ":Transforms", "//llvm:Support", ], ) @@ -4940,11 +4872,12 @@ cc_library( includes = ["include"], deps = [ ":AffineDialect", + ":Analysis", ":ArithDialect", ":ArithUtils", ":BufferizationInterfaces", + ":BytecodeOpInterface", ":ControlFlowInterfaces", - ":DataLayoutInterfaces", ":DestinationStyleOpInterface", ":DialectUtils", ":IR", @@ -4977,24 +4910,18 @@ cc_library( ], includes = ["include"], deps = [ - ":AffineDialect", - ":ArithDialect", - ":AsmParser", ":IR", ":LLVMCommonConversion", ":LLVMDialect", - ":SideEffectInterfaces", ":TransformDialect", ":TransformDialectInterfaces", ":TransformUtils", ":VectorDialect", - ":VectorEnumsIncGen", ":VectorToLLVM", ":VectorToSCF", ":VectorTransformOpsIncGen", ":VectorTransforms", ":X86VectorTransforms", - "//llvm:Support", ], ) @@ -5072,7 +4999,6 @@ cc_library( ":Support", ":TensorDialect", ":TransformUtils", - ":Transforms", ":VectorDialect", ":VectorEnumsIncGen", ":VectorInterfaces", @@ -5115,10 +5041,7 @@ cc_library( ]), hdrs = glob(["include/mlir/Support/*.h"]), includes = ["include"], - deps = [ - "//llvm:Support", - "//llvm:TargetParser", - ], + deps = 
["//llvm:Support"], ) cc_library( @@ -5158,7 +5081,6 @@ cc_library( deps = [ ":Support", "//llvm:Support", - "//llvm:TargetParser", ], ) @@ -5233,7 +5155,6 @@ cc_library( ":IR", ":Support", "//llvm:Support", - "//llvm:TargetParser", ], ) @@ -5282,7 +5203,6 @@ cc_library( ":AsmParser", ":BytecodeReader", ":IR", - ":Support", "//llvm:Support", ], ) @@ -5357,6 +5277,7 @@ cc_library( ), includes = ["include"], deps = [ + ":BytecodeOpInterface", ":CallOpInterfaces", ":ControlFlowInterfaces", ":DataLayoutInterfaces", @@ -5406,14 +5327,12 @@ cc_library( includes = ["include"], deps = [ ":FuncDialect", - ":GPUDialect", ":IR", ":LLVMDialect", ":LLVMPassIncGen", ":NVVMDialect", ":Pass", ":TransformUtils", - ":Transforms", "//llvm:BinaryFormat", "//llvm:Support", ], @@ -5561,6 +5480,7 @@ cc_library( deps = [ ":ArithDialect", ":BufferizationInterfaces", + ":BytecodeOpInterface", ":ControlFlowInterfaces", ":DLTIDialect", ":FunctionInterfaces", @@ -5571,7 +5491,6 @@ cc_library( ":InferIntRangeInterface", ":InferTypeOpInterface", ":InliningUtils", - ":LLVMDialect", ":MemRefDialect", ":SCFDialect", ":SideEffectInterfaces", @@ -5620,7 +5539,6 @@ cc_library( ":ArithToLLVM", ":FuncDialect", ":FuncToLLVM", - ":GPUCommonTransforms", ":GPUDialect", ":GPUToGPURuntimeTransforms", ":GPUToNVVMTransforms", @@ -5742,11 +5660,8 @@ cc_library( deps = [ ":AffineDialect", ":ArithDialect", - ":AsmParser", - ":ControlFlowDialect", ":DialectUtils", ":FuncDialect", - ":GPUCommonTransforms", ":GPUDialect", ":GPUToGPURuntimeTransforms", ":GPUToNVVMTransforms", @@ -5756,9 +5671,7 @@ cc_library( ":LLVMCommonConversion", ":MemRefDialect", ":NVVMDialect", - ":Parser", ":SCFDialect", - ":SideEffectInterfaces", ":Support", ":TransformDialect", ":TransformDialectInterfaces", @@ -5802,15 +5715,10 @@ cc_library( "lib/Conversion/GPUCommon/OpToFuncCallLowering.h", ], deps = [ - ":ConversionPassIncGen", - ":FuncDialect", ":GPUDialect", - ":GPUTransforms", ":IR", ":LLVMCommonConversion", ":LLVMDialect", - ":Support", - "//llvm:Support", ], ) @@ -5841,7 +5749,6 @@ cc_library( ]), includes = ["include"], deps = [ - ":ArithDialect", ":ArithToLLVM", ":ControlFlowDialect", ":ControlFlowToLLVM", @@ -5860,11 +5767,8 @@ cc_library( ":MemRefDialect", ":MemRefToLLVM", ":NVVMDialect", - ":Pass", ":TransformUtils", - ":Transforms", ":VectorToLLVM", - "//llvm:Support", ], ) @@ -5886,8 +5790,6 @@ cc_library( ":LLVMDialect", ":Pass", ":ROCDLDialect", - ":Support", - ":Transforms", "//llvm:Support", ], ) @@ -5904,7 +5806,6 @@ cc_library( deps = [ ":ArithDialect", ":ConversionPassIncGen", - ":GPUCommonTransforms", ":GPUDialect", ":GPUToGPURuntimeTransforms", ":IR", @@ -5937,7 +5838,6 @@ cc_library( ":SPIRVDialect", ":Support", ":TransformUtils", - ":Transforms", ":VectorDialect", "//llvm:Support", ], @@ -5994,7 +5894,6 @@ cc_library( ":Transforms", ":VectorDialect", ":VectorToLLVM", - ":VectorToSCF", "//llvm:Support", ], ) @@ -6072,15 +5971,11 @@ cc_library( ":GPUDialect", ":IR", ":MemRefToSPIRV", - ":Pass", - ":SCFDialect", ":SCFToSPIRV", ":SPIRVConversion", ":SPIRVDialect", ":Support", ":TransformUtils", - ":Transforms", - ":VectorToSPIRV", "//llvm:Support", ], ) @@ -6132,7 +6027,6 @@ cc_library( ":SPIRVUtils", ":Support", ":TransformUtils", - ":Transforms", "//llvm:Support", ], ) @@ -6277,6 +6171,7 @@ cc_library( includes = ["include"], deps = [ ":BasicPtxBuilderInterface", + ":BytecodeOpInterface", ":ConvertToLLVMInterface", ":DialectUtils", ":GPUDialect", @@ -6501,13 +6396,10 @@ cc_library( ":ConversionPassIncGen", ":ConvertToLLVMInterface", 
":FuncDialect", - ":GPUDialect", ":IR", ":LLVMCommonConversion", ":LLVMDialect", - ":MemRefDialect", ":NVVMDialect", - ":NVVMOpsIncGen", ":Pass", ":Support", "//llvm:Support", @@ -6520,6 +6412,7 @@ cc_library( hdrs = ["include/mlir/Dialect/LLVMIR/ROCDLDialect.h"], includes = ["include"], deps = [ + ":BytecodeOpInterface", ":GPUDialect", ":IR", ":LLVMDialect", @@ -6708,13 +6601,13 @@ cc_library( ]), includes = ["include"], deps = [ + ":BytecodeOpInterface", ":FunctionInterfaces", ":IR", ":InferTypeOpInterface", ":PDLDialect", ":PDLInterpOpsIncGen", ":SideEffectInterfaces", - "//llvm:Support", ], ) @@ -6914,6 +6807,7 @@ cc_library( ]), includes = ["include"], deps = [ + ":BytecodeOpInterface", ":CallOpInterfaces", ":CommonFolders", ":ControlFlowInterfaces", @@ -6928,10 +6822,8 @@ cc_library( ":SPIRVAvailabilityIncGen", ":SPIRVCanonicalizationIncGen", ":SPIRVOpsIncGen", - ":SPIRVSerializationGen", ":SideEffectInterfaces", ":Support", - ":Transforms", ":UBDialect", "//llvm:Support", ], @@ -6974,10 +6866,7 @@ cc_library( "include/mlir/Dialect/SPIRV/Utils/*.h", ]), includes = ["include"], - deps = [ - ":SPIRVDialect", - "//llvm:Support", - ], + deps = [":SPIRVDialect"], ) cc_library( @@ -7017,7 +6906,6 @@ cc_library( ":SPIRVUtils", ":Support", ":TransformUtils", - ":Transforms", "//llvm:Support", ], ) @@ -7030,7 +6918,6 @@ cc_library( ":IR", ":SPIRVDialect", ":TransformUtils", - ":Transforms", "//llvm:Support", ], ) @@ -7055,9 +6942,7 @@ cc_library( ":SPIRVCommonConversion", ":SPIRVConversion", ":SPIRVDialect", - ":Support", ":TransformUtils", - ":Transforms", "//llvm:Support", ], ) @@ -7078,12 +6963,8 @@ cc_library( ":ConversionPassIncGen", ":EmitCDialect", ":FuncDialect", - ":IR", ":Pass", - ":Support", ":TransformUtils", - ":Transforms", - "//llvm:Support", ], ) @@ -7100,21 +6981,16 @@ cc_library( "lib/Conversion/FuncToSPIRV", ], deps = [ - ":ControlFlowToSPIRV", ":ConversionPassIncGen", ":FuncDialect", ":IR", - ":MathToSPIRV", ":Pass", ":SPIRVCommonConversion", ":SPIRVConversion", ":SPIRVDialect", ":SPIRVUtils", ":Support", - ":TensorDialect", ":TransformUtils", - ":Transforms", - ":VectorDialect", "//llvm:Support", ], ) @@ -7134,7 +7010,6 @@ cc_library( deps = [ ":ArithDialect", ":ConversionPassIncGen", - ":FuncDialect", ":IR", ":LinalgDialect", ":LinalgTransforms", @@ -7142,8 +7017,6 @@ cc_library( ":Support", ":TensorDialect", ":TransformUtils", - ":Transforms", - ":VectorDialect", "//llvm:Support", ], ) @@ -7162,12 +7035,9 @@ cc_library( ], deps = [ ":ArithToSPIRV", - ":ControlFlowToSPIRV", ":ConversionPassIncGen", - ":FuncDialect", ":FuncToSPIRV", ":IR", - ":MathToSPIRV", ":Pass", ":SPIRVCommonConversion", ":SPIRVConversion", @@ -7176,8 +7046,6 @@ cc_library( ":Support", ":TensorDialect", ":TransformUtils", - ":Transforms", - ":VectorDialect", "//llvm:Support", ], ) @@ -7188,11 +7056,8 @@ cc_library( hdrs = ["include/mlir/Target/SPIRV/SPIRVBinaryUtils.h"], includes = ["include"], deps = [ - ":SPIRVAttrUtilsGen", ":SPIRVDialect", - ":SPIRVOpsIncGen", ":Support", - "//llvm:Support", ], ) @@ -7208,13 +7073,10 @@ cc_library( includes = ["include"], deps = [ ":IR", - ":SPIRVAttrUtilsGen", ":SPIRVBinaryUtils", ":SPIRVDialect", - ":SPIRVOpsIncGen", ":SPIRVSerializationGen", ":Support", - ":Transforms", "//llvm:Support", ], ) @@ -7229,13 +7091,10 @@ cc_library( includes = ["include"], deps = [ ":IR", - ":SPIRVAttrUtilsGen", ":SPIRVBinaryUtils", ":SPIRVDialect", - ":SPIRVOpsIncGen", ":SPIRVSerializationGen", ":Support", - ":Transforms", "//llvm:Support", ], ) @@ -7376,7 +7235,6 @@ cc_library( 
":IR", ":InferTypeOpInterface", ":TensorDialect", - "//llvm:Support", ], ) @@ -7398,7 +7256,6 @@ cc_library( ":TensorUtils", ":TilingInterface", ":ValueBoundsOpInterface", - "//llvm:Support", ], ) @@ -7414,7 +7271,6 @@ cc_library( ":DialectUtils", ":TensorDialect", ":ValueBoundsOpInterface", - "//llvm:Support", ], ) @@ -7465,7 +7321,6 @@ cc_library( ":TensorUtils", ":TilingInterface", ":TransformUtils", - ":Transforms", ":ValueBoundsOpInterface", ":VectorDialect", "//llvm:Support", @@ -7570,13 +7425,11 @@ cc_library( ":IR", ":InliningUtils", ":LoopLikeInterface", - ":MemorySlotInterfaces", ":Pass", ":Rewrite", ":SideEffectInterfaces", ":SubsetOpInterface", ":Support", - ":TransformsPassIncGen", ":config", "//llvm:Support", ], @@ -7607,7 +7460,6 @@ cc_library( deps = [ ":DerivedAttributeOpInterfaceIncGen", ":IR", - "//llvm:Support", ], ) @@ -7665,7 +7517,6 @@ cc_library( deps = [ ":IR", ":InferIntRangeInterfaceIncGen", - "//llvm:Support", ], ) @@ -7907,17 +7758,14 @@ cc_library( ":ControlFlowInterfaces", ":FunctionInterfaces", ":IR", - ":InliningUtils", ":LoopLikeInterface", ":MemorySlotInterfaces", ":Pass", - ":Rewrite", ":RuntimeVerifiableOpInterface", ":SideEffectInterfaces", ":Support", ":TransformUtils", ":TransformsPassIncGen", - ":config", "//llvm:Support", ], ) @@ -7945,7 +7793,6 @@ cc_library( ":ArithDialect", ":ComplexDialect", ":ConversionPassIncGen", - ":FuncDialect", ":FunctionInterfaces", ":GPUDialect", ":GPUTransforms", @@ -7968,16 +7815,12 @@ cc_library( includes = ["include"], deps = [ ":ArithDialect", - ":ControlFlowInterfaces", ":ConversionPassIncGen", ":EmitCDialect", ":IR", - ":Pass", ":SCFDialect", - ":Support", ":TransformUtils", ":Transforms", - "//llvm:Support", ], ) @@ -7991,10 +7834,8 @@ cc_library( ]), includes = ["include"], deps = [ - ":AffineDialect", ":ArithToSPIRV", ":ConversionPassIncGen", - ":FuncDialect", ":FuncToSPIRV", ":IR", ":IndexToSPIRV", @@ -8003,9 +7844,7 @@ cc_library( ":SCFDialect", ":SPIRVConversion", ":SPIRVDialect", - ":Support", ":TransformUtils", - ":Transforms", "//llvm:Support", ], ) @@ -8022,16 +7861,13 @@ cc_library( ":Analysis", ":ArithDialect", ":ConversionPassIncGen", - ":FuncDialect", ":IR", ":LLVMDialect", ":MemRefDialect", ":OpenMPDialect", ":Pass", ":SCFDialect", - ":Support", ":TransformUtils", - ":Transforms", ], ) @@ -8046,12 +7882,8 @@ cc_library( ":ArithDialect", ":ControlFlowDialect", ":ConversionPassIncGen", - ":FuncDialect", ":IR", - ":LLVMDialect", - ":Pass", ":SCFDialect", - ":Support", ":TransformUtils", ":Transforms", ], @@ -8114,9 +7946,7 @@ cc_library( ":LLVMDialect", ":Pass", ":Rewrite", - ":Support", ":TransformUtils", - "//llvm:Support", ], ) @@ -8174,10 +8004,7 @@ cc_library( ":Pass", ":SCFDialect", ":TransformUtils", - ":Transforms", ":UBDialect", - "//llvm:Core", - "//llvm:Support", ], ) @@ -8191,24 +8018,14 @@ cc_library( ], includes = ["include"], deps = [ - ":Analysis", - ":ArithToLLVM", ":ControlFlowDialect", ":ConversionPassIncGen", ":ConvertToLLVMInterface", - ":DataLayoutInterfaces", - ":DialectUtils", ":IR", ":LLVMCommonConversion", ":LLVMDialect", - ":MathDialect", - ":MemRefDialect", - ":Parser", ":Pass", - ":Support", ":TransformUtils", - ":Transforms", - "//llvm:Core", "//llvm:Support", ], ) @@ -8229,7 +8046,6 @@ cc_library( ":SPIRVUtils", ":Support", ":TransformUtils", - ":Transforms", "//llvm:Support", ], ) @@ -8252,10 +8068,7 @@ cc_library( ":IR", ":MemRefDialect", ":Pass", - ":Support", ":TransformUtils", - ":Transforms", - "//llvm:Support", ], ) @@ -8269,7 +8082,6 @@ cc_library( 
":ArithDialect", ":ConversionPassIncGen", ":ConvertToLLVMInterface", - ":DataLayoutInterfaces", ":FuncDialect", ":IR", ":LLVMCommonConversion", @@ -8278,7 +8090,6 @@ cc_library( ":MemRefUtils", ":Pass", ":Support", - ":Transforms", "//llvm:Support", ], ) @@ -8298,7 +8109,6 @@ cc_library( deps = [ ":ArithDialect", ":ConversionPassIncGen", - ":FuncDialect", ":FunctionInterfaces", ":IR", ":MemRefDialect", @@ -8307,7 +8117,6 @@ cc_library( ":SPIRVDialect", ":Support", ":TransformUtils", - ":Transforms", "//llvm:Support", ], ) @@ -8333,13 +8142,10 @@ cc_library( ":ArithDialect", ":ArithUtils", ":ConversionPassIncGen", - ":ConvertToLLVMInterface", ":IR", - ":LLVMDialect", ":Pass", ":Support", ":TransformUtils", - ":Transforms", ":VectorDialect", ], ) @@ -8357,11 +8163,8 @@ cc_library( ":ArithDialect", ":ArmSMEDialect", ":ConversionPassIncGen", - ":IR", ":Pass", ":TransformUtils", - ":Transforms", - "//llvm:Support", ], ) @@ -8381,12 +8184,8 @@ cc_library( ":ArithDialect", ":ConversionPassIncGen", ":EmitCDialect", - ":IR", ":Pass", - ":Support", ":TransformUtils", - ":Transforms", - "//llvm:Support", ], ) @@ -8396,7 +8195,6 @@ cc_library( hdrs = glob(["include/mlir/Conversion/ArithToLLVM/*.h"]), includes = ["include"], deps = [ - ":Analysis", ":ArithAttrToLLVMConversion", ":ArithDialect", ":ConversionPassIncGen", @@ -8405,8 +8203,6 @@ cc_library( ":LLVMCommonConversion", ":LLVMDialect", ":Pass", - ":Support", - ":Transforms", ], ) @@ -8418,14 +8214,11 @@ cc_library( deps = [ ":ArithDialect", ":ConversionPassIncGen", - ":FuncToSPIRV", ":IR", ":Pass", ":SPIRVCommonConversion", ":SPIRVConversion", ":SPIRVDialect", - ":Support", - ":Transforms", "//llvm:Support", ], ) @@ -8436,18 +8229,14 @@ cc_library( hdrs = glob(["include/mlir/Conversion/MathToLLVM/*.h"]), includes = ["include"], deps = [ - ":Analysis", ":ArithAttrToLLVMConversion", ":ConversionPassIncGen", ":ConvertToLLVMInterface", - ":DataLayoutInterfaces", ":IR", ":LLVMCommonConversion", ":LLVMDialect", ":MathDialect", ":Pass", - ":Support", - ":Transforms", ], ) @@ -8468,7 +8257,6 @@ cc_library( ":Pass", ":SCFDialect", ":TransformUtils", - ":Transforms", ":VectorDialect", ":VectorUtils", "//llvm:Support", @@ -8563,7 +8351,6 @@ cc_library( deps = [ ":CastInterfacesIncGen", ":IR", - "//llvm:Support", ], ) @@ -8915,8 +8702,6 @@ cc_library( deps = [ ":IR", ":ToLLVMIRTranslation", - "//llvm:Core", - "//llvm:Support", ], ) @@ -9184,7 +8969,6 @@ cc_library( includes = ["include"], deps = [ ":IR", - ":Pass", ":Support", "//llvm:Support", ], @@ -9196,7 +8980,6 @@ cc_library( hdrs = ["include/mlir/Tools/mlir-opt/MlirOptMain.h"], includes = ["include"], deps = [ - ":BytecodeReader", ":BytecodeWriter", ":Debug", ":IR", @@ -9619,7 +9402,6 @@ cc_library( deps = [ ":mlir_c_runner_utils", ":mlir_float16_utils", - "//llvm:Support", ], ) @@ -9728,13 +9510,7 @@ cc_library( "manual", # External dependency ], deps = [ - ":FuncDialect", - ":IR", - ":Pass", - ":SPIRVDialect", - ":SideEffectInterfaces", ":Support", - "//llvm:Support", "@vulkan_headers", "@vulkan_sdk//:sdk", ], @@ -9914,7 +9690,6 @@ cc_library( ":AtomicInterfacesIncGen", ":ControlFlowInterfaces", ":IR", - "//llvm:Support", ], ) @@ -10099,6 +9874,7 @@ cc_library( deps = [ ":AtomicInterfaces", ":AtomicInterfacesIncGen", + ":BytecodeOpInterface", ":ControlFlowInterfaces", ":IR", ":LLVMDialect", @@ -10108,8 +9884,8 @@ cc_library( ":OpenACCOpsInterfacesIncGen", ":OpenACCTypeInterfacesIncGen", ":OpenACCTypesIncGen", + ":SideEffectInterfaces", ":TransformUtils", - ":Transforms", "//llvm:Support", ], ) @@ 
 -10141,7 +9917,6 @@ cc_library(
     includes = ["include"],
     deps = [
         ":FuncDialect",
-        ":LLVMIRTransforms",
         ":OpenACCDialect",
         ":OpenACCPassIncGen",
         ":Pass",
@@ -10323,6 +10098,7 @@ cc_library(
         ":OpenMPInterfacesIncGen",
         ":OpenMPOpsIncGen",
         ":OpenMPTypeInterfacesIncGen",
+        ":SideEffectInterfaces",
         ":Support",
         "//llvm:FrontendOpenMP",
         "//llvm:Support",
@@ -10341,15 +10117,11 @@ cc_library(
     deps = [
         ":ArithDialect",
         ":ConversionPassIncGen",
-        ":FuncDialect",
         ":IR",
         ":OpenACCDialect",
-        ":OpenACCOpsIncGen",
-        ":OpenACCTypesIncGen",
         ":Pass",
         ":SCFDialect",
         ":TransformUtils",
-        ":Transforms",
     ],
 )
 
@@ -10366,17 +10138,12 @@ cc_library(
         ":ArithToLLVM",
         ":ControlFlowToLLVM",
         ":ConversionPassIncGen",
-        ":FuncDialect",
         ":FuncToLLVM",
-        ":IR",
         ":LLVMCommonConversion",
         ":LLVMDialect",
         ":MemRefToLLVM",
         ":OpenMPDialect",
         ":Pass",
-        ":Transforms",
-        "//llvm:Core",
-        "//llvm:Support",
     ],
 )
 
@@ -10461,17 +10228,13 @@ cc_library(
     ],
     includes = ["include"],
     deps = [
-        ":ArithDialect",
         ":BytecodeOpInterface",
-        ":FuncDialect",
         ":IR",
         ":InferTypeOpInterface",
-        ":Pass",
         ":QuantDialectBytecodeGen",
         ":QuantOpsIncGen",
         ":SideEffectInterfaces",
         ":Support",
-        ":TransformUtils",
         "//llvm:Support",
     ],
 )
@@ -10573,18 +10336,12 @@ cc_library(
     ]),
     includes = ["include"],
     deps = [
-        ":Analysis",
         ":ConversionPassIncGen",
         ":ConvertToLLVMInterface",
-        ":IR",
         ":IndexDialect",
         ":LLVMCommonConversion",
         ":LLVMDialect",
         ":Pass",
-        ":Support",
-        ":Transforms",
-        "//llvm:Core",
-        "//llvm:Support",
     ],
 )
 
@@ -10599,16 +10356,11 @@ cc_library(
     includes = ["include"],
     deps = [
         ":ConversionPassIncGen",
-        ":IR",
         ":IndexDialect",
         ":Pass",
         ":SPIRVCommonConversion",
         ":SPIRVConversion",
         ":SPIRVDialect",
-        ":Support",
-        ":Transforms",
-        "//llvm:Core",
-        "//llvm:Support",
     ],
 )
 
@@ -10618,6 +10370,7 @@ cc_library(
     hdrs = glob(["include/mlir/Dialect/Index/IR/*.h"]),
     includes = ["include"],
     deps = [
+        ":BytecodeOpInterface",
         ":CastInterfaces",
         ":ConvertToLLVMInterface",
         ":IR",
@@ -10626,6 +10379,7 @@ cc_library(
         ":InferIntRangeCommon",
         ":InferIntRangeInterface",
         ":InferTypeOpInterface",
+        ":SideEffectInterfaces",
         "//llvm:Support",
     ],
 )
@@ -10979,9 +10733,7 @@ cc_library(
         ":DestinationStyleOpInterface",
         ":IR",
         ":SubsetOpInterfaceIncGen",
-        ":Support",
         ":ValueBoundsOpInterface",
-        "//llvm:Support",
     ],
 )
 
@@ -11021,18 +10773,13 @@ cc_library(
         ":AffineDialect",
         ":ConversionPassIncGen",
         ":FuncDialect",
-        ":IR",
         ":LLVMDialect",
         ":LinalgDialect",
         ":LinalgTransforms",
         ":MemRefDialect",
         ":Pass",
         ":SCFDialect",
-        ":Support",
         ":TransformUtils",
-        ":Transforms",
-        "//llvm:Core",
-        "//llvm:Support",
     ],
 )
 
@@ -11046,7 +10793,6 @@ cc_library(
         ":ArithDialect",
         ":ArithUtils",
         ":AsmParser",
-        ":BufferizationDialect",
         ":BufferizationInterfaces",
         ":BytecodeOpInterface",
         ":ComplexDialect",
@@ -11054,7 +10800,6 @@ cc_library(
         ":CopyOpInterface",
         ":DestinationStyleOpInterface",
         ":DialectUtils",
-        ":FuncDialect",
         ":FunctionInterfaces",
         ":IR",
         ":InferTypeOpInterface",
@@ -11201,7 +10946,6 @@ cc_library(
         ":AffineUtils",
         ":Analysis",
         ":ArithDialect",
-        ":ArithTransforms",
         ":ArithUtils",
         ":BufferizationDialect",
         ":BufferizationInterfaces",
@@ -11218,9 +10962,7 @@ cc_library(
         ":LinalgPassIncGen",
         ":LinalgStructuredOpsIncGen",
         ":LinalgUtils",
-        ":LoopLikeInterface",
         ":MaskableOpInterface",
-        ":MathDialect",
         ":MemRefDialect",
         ":MemRefTransforms",
         ":MeshDialect",
@@ -11239,7 +10981,6 @@ cc_library(
         ":TensorUtils",
         ":TilingInterface",
         ":TransformUtils",
-        ":Transforms",
         ":ValueBoundsOpInterface",
         ":VectorDialect",
         ":VectorToSCF",
@@ -11289,7 +11030,6 @@ cc_library(
         ":Analysis",
":DestinationStyleOpInterface", ":IR", - ":Support", ":ValueBoundsOpInterfaceIncGen", ":ViewLikeInterface", "//llvm:Support", @@ -11303,7 +11043,6 @@ cc_library( includes = ["include"], deps = [ ":ArithDialect", - ":IR", ":ValueBoundsOpInterface", ], ) @@ -11319,7 +11058,6 @@ cc_library( ":Support", ":TilingInterfaceIncGen", ":ViewLikeInterface", - "//llvm:Support", ], ) @@ -11516,8 +11254,6 @@ cc_library( ":IR", ":MaskableOpInterfaceIncGen", ":MaskingOpInterface", - ":Support", - "//llvm:Support", ], ) @@ -11529,8 +11265,6 @@ cc_library( deps = [ ":IR", ":MaskingOpInterfaceIncGen", - ":Support", - "//llvm:Support", ], ) @@ -11550,12 +11284,9 @@ cc_library( ":ArithDialect", ":ArithUtils", ":ArmNeonDialect", - ":ArmSMEDialect", - ":ArmSMETransforms", ":ArmSVEDialect", ":ArmSVETransforms", ":ConversionPassIncGen", - ":DialectUtils", ":FuncDialect", ":IR", ":LLVMCommonConversion", @@ -11563,15 +11294,12 @@ cc_library( ":MaskableOpInterface", ":MemRefDialect", ":Pass", - ":Support", ":ToLLVMIRTranslation", ":TransformUtils", - ":Transforms", ":VectorDialect", ":VectorTransforms", ":X86VectorDialect", ":X86VectorTransforms", - "//llvm:Core", "//llvm:Support", ], ) @@ -11592,7 +11320,6 @@ cc_library( ":MemRefDialect", ":Pass", ":TransformUtils", - ":Transforms", "//llvm:Support", ], ) @@ -11612,15 +11339,11 @@ cc_library( ":ArithDialect", ":ConversionPassIncGen", ":DialectUtils", - ":FuncDialect", - ":FuncToLLVM", ":GPUDialect", ":IR", - ":LLVMDialect", ":MemRefDialect", ":NVGPUDialect", ":NVGPUUtils", - ":NVVMDialect", ":Pass", ":SCFDialect", ":Support", @@ -11629,7 +11352,6 @@ cc_library( ":VectorDialect", ":VectorTransforms", ":VectorUtils", - "//llvm:Core", "//llvm:Support", ], ) @@ -11645,24 +11367,17 @@ cc_library( includes = ["include"], deps = [ ":AffineDialect", - ":AffineUtils", ":ArithDialect", ":ConversionPassIncGen", - ":FuncDialect", - ":FuncToLLVM", ":IR", - ":LLVMDialect", ":MemRefDialect", ":Pass", ":SCFDialect", - ":Support", ":TensorDialect", ":TransformUtils", ":Transforms", ":VectorDialect", ":VectorTransforms", - "//llvm:Core", - "//llvm:Support", ], ) @@ -11798,8 +11513,8 @@ cc_library( ]), includes = ["include"], deps = [ - ":Analysis", ":ArithDialect", + ":BytecodeOpInterface", ":Dialect", ":DialectUtils", ":FuncDialect", @@ -11811,6 +11526,7 @@ cc_library( ":MeshShardingInterface", ":Pass", ":QuantOps", + ":SideEffectInterfaces", ":Support", ":TensorDialect", ":TosaDialectBytecodeGen", @@ -11838,12 +11554,10 @@ cc_library( deps = [ ":ArithDialect", ":ConversionPassIncGen", - ":FuncDialect", ":IR", ":Pass", ":TosaDialect", ":TransformUtils", - ":Transforms", ], ) @@ -11867,7 +11581,6 @@ cc_library( ":FuncDialect", ":IR", ":LinalgDialect", - ":LinalgUtils", ":MathDialect", ":Pass", ":SCFDialect", @@ -11894,13 +11607,11 @@ cc_library( ], deps = [ ":ConversionPassIncGen", - ":FuncDialect", ":IR", ":MLProgramDialect", ":Pass", ":TosaDialect", ":TransformUtils", - ":Transforms", ], ) @@ -11925,7 +11636,6 @@ cc_library( ":TensorDialect", ":TosaDialect", ":TransformUtils", - ":Transforms", ], ) @@ -11945,14 +11655,12 @@ cc_library( ":ArithDialect", ":ArithUtils", ":ConversionPassIncGen", - ":FuncDialect", ":IR", ":Pass", ":TensorDialect", ":TensorUtils", ":TosaDialect", ":TransformUtils", - ":Transforms", ], ) @@ -12354,7 +12062,6 @@ cc_library( hdrs = glob(["include/mlir/Dialect/Transform/Utils/*.h"]), includes = ["include"], deps = [ - ":DialectUtils", ":IR", ":Support", ":ViewLikeInterface", @@ -12475,15 +12182,9 @@ cc_library( ":ComplexDialect", ":ConversionPassIncGen", 
":ConvertToLLVMInterface", - ":FuncDialect", - ":IR", ":LLVMCommonConversion", ":LLVMDialect", ":Pass", - ":Support", - ":Transforms", - "//llvm:Core", - "//llvm:Support", ], ) @@ -12499,15 +12200,10 @@ cc_library( deps = [ ":ComplexDialect", ":ConversionPassIncGen", - ":DialectUtils", ":FuncDialect", ":IR", ":Pass", - ":Support", ":TransformUtils", - ":Transforms", - "//llvm:Core", - "//llvm:Support", ], ) @@ -12523,15 +12219,10 @@ cc_library( deps = [ ":ComplexDialect", ":ConversionPassIncGen", - ":IR", ":Pass", - ":SPIRVCommonConversion", ":SPIRVConversion", ":SPIRVDialect", - ":Support", ":TransformUtils", - ":Transforms", - "//llvm:Core", "//llvm:Support", ], ) @@ -12549,12 +12240,10 @@ cc_library( ":ArithDialect", ":ComplexDialect", ":ConversionPassIncGen", - ":FuncDialect", ":IR", ":MathDialect", ":Pass", ":TransformUtils", - ":Transforms", ], ) @@ -12774,13 +12463,11 @@ cc_library( ":FuncDialect", ":FuncTransforms", ":IR", - ":InferIntRangeInterface", ":MemRefDialect", ":Pass", ":Support", ":TensorDialect", ":TransformUtils", - ":Transforms", ":ValueBoundsOpInterface", ":VectorDialect", "//llvm:Support", @@ -12898,7 +12585,6 @@ cc_library( ":SideEffectInterfaces", ":UBDialect", ":VectorInterfaces", - "//llvm:Support", ], ) @@ -12918,7 +12604,6 @@ cc_library( ":Pass", ":SCFDialect", ":TransformUtils", - ":Transforms", ":VectorDialect", ":VectorUtils", ":X86VectorDialect", @@ -12944,13 +12629,8 @@ cc_library( ":LLVMDialect", ":MathDialect", ":Pass", - ":Support", ":TransformUtils", - ":Transforms", ":VectorDialect", - ":VectorUtils", - "//llvm:Core", - "//llvm:Support", ], ) @@ -13033,11 +12713,9 @@ cc_library( ":AllocationOpInterface", ":ArithDialect", ":ArithUtils", - ":BufferizationInterfaces", ":BytecodeOpInterface", ":CallOpInterfaces", ":CastInterfaces", - ":ComplexDialect", ":ControlFlowInterfaces", ":ConvertToLLVMInterface", ":CopyOpInterface", @@ -13055,7 +12733,6 @@ cc_library( ":ValueBoundsOpInterface", ":ViewLikeInterface", "//llvm:Support", - "//llvm:TargetParser", ], ) @@ -13113,7 +12790,6 @@ cc_library( ":ArithUtils", ":BufferizationDialect", ":BufferizationInterfaces", - ":BufferizationTransforms", ":ControlFlowDialect", ":DialectUtils", ":FuncDialect", @@ -13131,7 +12807,6 @@ cc_library( ":Support", ":TensorDialect", ":TransformUtils", - ":Transforms", ":ValueBoundsOpInterface", ":VectorDialect", "//llvm:Support", @@ -13177,6 +12852,7 @@ cc_library( ":AffineDialect", ":Analysis", ":ArithDialect", + ":BytecodeOpInterface", ":IR", ":LLVMCommonConversion", ":LoopLikeInterface", @@ -13309,10 +12985,7 @@ cc_library( ":MLProgramAttributesIncGen", ":MLProgramOpsIncGen", ":MLProgramTypesIncGen", - ":Pass", ":SideEffectInterfaces", - ":Support", - ":Transforms", "//llvm:Support", ], ) @@ -13327,7 +13000,6 @@ cc_library( ]), includes = ["include"], deps = [ - ":BufferizationDialect", ":BufferizationInterfaces", ":FuncDialect", ":IR", @@ -13447,14 +13119,12 @@ cc_library( hdrs = glob(["include/mlir/Dialect/MPI/IR/*.h"]), includes = ["include"], deps = [ - ":Dialect", + ":BytecodeOpInterface", ":IR", - ":InferTypeOpInterface", ":MPIAttrsIncGen", ":MPIIncGen", ":MPIOpsIncGen", ":MPITypesIncGen", - ":SideEffectInterfaces", "//llvm:Support", ], ) @@ -13616,16 +13286,14 @@ cc_library( ":BufferizationInterfaces", ":BufferizationTransformOpsIncGen", ":BufferizationTransforms", + ":BytecodeOpInterface", ":FunctionInterfaces", ":IR", ":LinalgDialect", ":MemRefDialect", - ":Parser", - ":SideEffectInterfaces", ":TensorDialect", ":TransformDialect", ":TransformDialectInterfaces", - 
"//llvm:Support", ], ) @@ -13689,7 +13357,6 @@ cc_library( deps = [ ":AffineDialect", ":AllocationOpInterface", - ":Analysis", ":ArithDialect", ":BufferDeallocationOpInterfaceIncGen", ":BufferViewFlowOpInterfaceIncGen", @@ -13698,7 +13365,6 @@ cc_library( ":BufferizationInterfaces", ":BufferizationOpsIncGen", ":BytecodeOpInterface", - ":CallOpInterfaces", ":ControlFlowInterfaces", ":CopyOpInterface", ":DestinationStyleOpInterface", @@ -13710,7 +13376,6 @@ cc_library( ":MemRefDialect", ":SparseTensorDialect", ":SubsetOpInterface", - ":Support", ":TensorDialect", "//llvm:Support", ], @@ -13791,7 +13456,6 @@ cc_library( ":SCFDialect", ":Support", ":TransformUtils", - ":Transforms", ], ) @@ -13801,9 +13465,7 @@ cc_library( hdrs = ["include/mlir/Dialect/Bufferization/Pipelines/Passes.h"], includes = ["include"], deps = [ - ":BufferizationDialect", ":BufferizationInterfaces", - ":BufferizationToMemRef", ":BufferizationTransforms", ":FuncDialect", ":MemRefTransforms", @@ -13937,7 +13599,6 @@ cc_library( deps = [ ":Support", "//llvm:Support", - "//llvm:TargetParser", ], ) @@ -13954,7 +13615,6 @@ cc_library( deps = [ ":Support", "//llvm:Support", - "//llvm:TargetParser", ], ) @@ -14109,6 +13769,7 @@ cc_library( hdrs = ["include/mlir/Dialect/UB/IR/UBOps.h"], includes = ["include"], deps = [ + ":BytecodeOpInterface", ":ConvertToLLVMInterface", ":IR", ":InliningUtils", @@ -14154,7 +13815,6 @@ cc_library( ":SPIRVConversion", ":SPIRVDialect", ":UBDialect", - "//llvm:Core", ], ) @@ -14164,6 +13824,7 @@ cc_library( hdrs = ["include/mlir/Dialect/LLVMIR/VCIXDialect.h"], includes = ["include"], deps = [ + ":BytecodeOpInterface", ":GPUDialect", ":IR", ":LLVMDialect",