diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 5e349cd69fb43..295558c0d3af8 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -1371,20 +1371,13 @@ class MCPlusBuilder { /// Return true if \p Inst has RestoreState annotation. bool hasRestoreState(const MCInst &Inst) const; - /// Stores RA Signed annotation on \p Inst. - void setRASigned(MCInst &Inst) const; + /// Sets kRASigned or kRAUnsigned annotation on \p Inst. + /// Fails if \p Inst has either annotation already set. + void setRAState(MCInst &Inst, bool State) const; - /// Return true if \p Inst has Signed RA annotation. - bool isRASigned(const MCInst &Inst) const; - - /// Stores RA Unsigned annotation on \p Inst. - void setRAUnsigned(MCInst &Inst) const; - - /// Return true if \p Inst has Unsigned RA annotation. - bool isRAUnsigned(const MCInst &Inst) const; - - /// Return true if \p Inst doesn't have any annotation related to RA state. - bool isRAStateUnknown(const MCInst &Inst) const; + /// Return true if \p Inst has kRASigned annotation, false if it has + /// kRAUnsigned annotation, and std::nullopt if neither annotation is set. + std::optional getRAState(const MCInst &Inst) const; /// Return true if the instruction is a call with an exception handling info. virtual bool isInvoke(const MCInst &Inst) const { diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp index e96de80bfa701..0cb4ba1ebfbd7 100644 --- a/bolt/lib/Core/MCPlusBuilder.cpp +++ b/bolt/lib/Core/MCPlusBuilder.cpp @@ -186,26 +186,21 @@ bool MCPlusBuilder::hasRestoreState(const MCInst &Inst) const { return hasAnnotation(Inst, MCAnnotation::kRestoreState); } -void MCPlusBuilder::setRASigned(MCInst &Inst) const { +void MCPlusBuilder::setRAState(MCInst &Inst, bool State) const { assert(!hasAnnotation(Inst, MCAnnotation::kRASigned)); - setAnnotationOpValue(Inst, MCAnnotation::kRASigned, true); -} - -bool MCPlusBuilder::isRASigned(const MCInst &Inst) const { - return hasAnnotation(Inst, MCAnnotation::kRASigned); -} - -void MCPlusBuilder::setRAUnsigned(MCInst &Inst) const { assert(!hasAnnotation(Inst, MCAnnotation::kRAUnsigned)); - setAnnotationOpValue(Inst, MCAnnotation::kRAUnsigned, true); + if (State) + setAnnotationOpValue(Inst, MCAnnotation::kRASigned, true); + else + setAnnotationOpValue(Inst, MCAnnotation::kRAUnsigned, true); } -bool MCPlusBuilder::isRAUnsigned(const MCInst &Inst) const { - return hasAnnotation(Inst, MCAnnotation::kRAUnsigned); -} - -bool MCPlusBuilder::isRAStateUnknown(const MCInst &Inst) const { - return !(isRAUnsigned(Inst) || isRASigned(Inst)); +std::optional MCPlusBuilder::getRAState(const MCInst &Inst) const { + if (hasAnnotation(Inst, MCAnnotation::kRASigned)) + return true; + if (hasAnnotation(Inst, MCAnnotation::kRAUnsigned)) + return false; + return std::nullopt; } std::optional MCPlusBuilder::getEHInfo(const MCInst &Inst) const { diff --git a/bolt/lib/Passes/InsertNegateRAStatePass.cpp b/bolt/lib/Passes/InsertNegateRAStatePass.cpp index 33664e1160a7b..775b7795e77c5 100644 --- a/bolt/lib/Passes/InsertNegateRAStatePass.cpp +++ b/bolt/lib/Passes/InsertNegateRAStatePass.cpp @@ -21,7 +21,12 @@ using namespace llvm; namespace llvm { namespace bolt { +static bool PassFailed = false; + void InsertNegateRAState::runOnFunction(BinaryFunction &BF) { + if (PassFailed) + return; + BinaryContext &BC = BF.getBinaryContext(); if (BF.getState() == BinaryFunction::State::Empty) @@ -39,7 +44,7 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) { for (FunctionFragment &FF : BF.getLayout().fragments()) { coverFunctionFragmentStart(BF, FF); bool FirstIter = true; - MCInst PrevInst; + bool PrevRAState = false; // As this pass runs after function splitting, we should only check // consecutive instructions inside FunctionFragments. for (BinaryBasicBlock *BB : FF) { @@ -47,18 +52,23 @@ void InsertNegateRAState::runOnFunction(BinaryFunction &BF) { MCInst &Inst = *It; if (BC.MIB->isCFI(Inst)) continue; + auto RAState = BC.MIB->getRAState(Inst); + if (!RAState) { + BC.errs() << "BOLT-ERROR: unknown RAState after inferUnknownStates " + << " in function " << BF.getPrintName() << "\n"; + PassFailed = true; + return; + } if (!FirstIter) { // Consecutive instructions with different RAState means we need to // add a OpNegateRAState. - if ((BC.MIB->isRASigned(PrevInst) && BC.MIB->isRAUnsigned(Inst)) || - (BC.MIB->isRAUnsigned(PrevInst) && BC.MIB->isRASigned(Inst))) { + if (*RAState != PrevRAState) It = BF.addCFIInstruction( BB, It, MCCFIInstruction::createNegateRAState(nullptr)); - } } else { FirstIter = false; } - PrevInst = *It; + PrevRAState = *RAState; } } } @@ -81,10 +91,17 @@ void InsertNegateRAState::coverFunctionFragmentStart(BinaryFunction &BF, }); // If a function is already split in the input, the first FF can also start // with Signed state. This covers that scenario as well. - if (BC.MIB->isRASigned(*((*FirstNonEmpty)->begin()))) { - BF.addCFIInstruction(*FirstNonEmpty, (*FirstNonEmpty)->begin(), - MCCFIInstruction::createNegateRAState(nullptr)); + auto II = (*FirstNonEmpty)->getFirstNonPseudo(); + auto RAState = BC.MIB->getRAState(*II); + if (!RAState) { + BC.errs() << "BOLT-ERROR: unknown RAState after inferUnknownStates " + << " in function " << BF.getPrintName() << "\n"; + PassFailed = true; + return; } + if (*RAState) + BF.addCFIInstruction(*FirstNonEmpty, II, + MCCFIInstruction::createNegateRAState(nullptr)); } void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) { @@ -96,15 +113,21 @@ void InsertNegateRAState::inferUnknownStates(BinaryFunction &BF) { if (BC.MIB->isCFI(Inst)) continue; - if (!FirstIter && BC.MIB->isRAStateUnknown(Inst)) { - if (BC.MIB->isRASigned(PrevInst) || BC.MIB->isPSignOnLR(PrevInst)) { - BC.MIB->setRASigned(Inst); - } else if (BC.MIB->isRAUnsigned(PrevInst) || - BC.MIB->isPAuthOnLR(PrevInst)) { - BC.MIB->setRAUnsigned(Inst); + auto RAState = BC.MIB->getRAState(Inst); + if (!FirstIter && !RAState) { + if (BC.MIB->isPSignOnLR(PrevInst)) + RAState = true; + else if (BC.MIB->isPAuthOnLR(PrevInst)) + RAState = false; + else { + auto PrevRAState = BC.MIB->getRAState(PrevInst); + RAState = PrevRAState ? *PrevRAState : false; } + BC.MIB->setRAState(Inst, *RAState); } else { FirstIter = false; + if (!RAState) + BC.MIB->setRAState(Inst, BF.getInitialRAState()); } PrevInst = Inst; } @@ -135,6 +158,8 @@ Error InsertNegateRAState::runOnFunctions(BinaryContext &BC) { << " functions " << format("(%.2lf%%).\n", (100.0 * FunctionsModified) / BC.getBinaryFunctions().size()); + if (PassFailed) + return createFatalBOLTError(""); return Error::success(); } diff --git a/bolt/lib/Passes/MarkRAStates.cpp b/bolt/lib/Passes/MarkRAStates.cpp index b262d66732b7d..51075be0e1ac2 100644 --- a/bolt/lib/Passes/MarkRAStates.cpp +++ b/bolt/lib/Passes/MarkRAStates.cpp @@ -72,9 +72,6 @@ bool MarkRAStates::runOnFunction(BinaryFunction &BF) { BF.setIgnored(); return false; } - // The signing instruction itself is unsigned, the next will be - // signed. - BC.MIB->setRAUnsigned(Inst); } else if (BC.MIB->isPAuthOnLR(Inst)) { if (!RAState) { // RA authenticating instructions should only follow signed RA state. @@ -86,15 +83,10 @@ bool MarkRAStates::runOnFunction(BinaryFunction &BF) { BF.setIgnored(); return false; } - // The authenticating instruction itself is signed, but the next will be - // unsigned. - BC.MIB->setRASigned(Inst); - } else if (RAState) { - BC.MIB->setRASigned(Inst); - } else { - BC.MIB->setRAUnsigned(Inst); } + BC.MIB->setRAState(Inst, RAState); + // Updating RAState. All updates are valid from the next instruction. // Because the same instruction can have remember and restore, the order // here is relevant. This is the reason to loop over Annotations instead diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 1013bfc575747..da608370645ac 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1069,7 +1069,7 @@ def AVRSignal : InheritableAttr, TargetSpecificAttr { } def AsmLabel : InheritableAttr { - let Spellings = [CustomKeyword<"asm">, CustomKeyword<"__asm__">]; + let Spellings = [CustomKeyword<"asm">, CustomKeyword<"__asm">, CustomKeyword<"__asm__">]; let Args = [ // Label specifies the mangled name for the decl. StringArgument<"Label">, ]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 1be9a96aa44de..f1dbd8af6093a 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -4295,17 +4295,17 @@ used by other languages. (This prefix is also added to the standard Itanium C++ ABI prefix on "mangled" symbol names, so that e.g. on such targets the true symbol name for a C++ variable declared as ``int cppvar;`` would be ``__Z6cppvar``; note the two underscores.) This prefix is *not* added to the -symbol names specified by the ``asm`` attribute; programmers wishing to match a -C symbol name must compensate for this. +symbol names specified by the ``__asm`` attribute; programmers wishing to match +a C symbol name must compensate for this. For example, consider the following C code: .. code-block:: c - int var1 asm("altvar") = 1; // "altvar" in symbol table. + int var1 __asm("altvar") = 1; // "altvar" in symbol table. int var2 = 1; // "_var2" in symbol table. - void func1(void) asm("altfunc"); + void func1(void) __asm("altfunc"); void func1(void) {} // "altfunc" in symbol table. void func2(void) {} // "_func2" in symbol table. diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index bf51c3e42719c..735f3157b694e 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -696,6 +696,10 @@ namespace clang { ExpectedStmt VisitCXXFoldExpr(CXXFoldExpr *E); ExpectedStmt VisitRequiresExpr(RequiresExpr* E); ExpectedStmt VisitConceptSpecializationExpr(ConceptSpecializationExpr* E); + ExpectedStmt + VisitSubstNonTypeTemplateParmPackExpr(SubstNonTypeTemplateParmPackExpr *E); + ExpectedStmt VisitPseudoObjectExpr(PseudoObjectExpr *E); + ExpectedStmt VisitCXXParenListInitExpr(CXXParenListInitExpr *E); // Helper for chaining together multiple imports. If an error is detected, // subsequent imports will return default constructed nodes, so that failure @@ -9273,6 +9277,50 @@ ASTNodeImporter::VisitConceptSpecializationExpr(ConceptSpecializationExpr *E) { const_cast(CSD), &Satisfaction); } +ExpectedStmt ASTNodeImporter::VisitSubstNonTypeTemplateParmPackExpr( + SubstNonTypeTemplateParmPackExpr *E) { + Error Err = Error::success(); + auto ToType = importChecked(Err, E->getType()); + auto ToPackLoc = importChecked(Err, E->getParameterPackLocation()); + auto ToArgPack = importChecked(Err, E->getArgumentPack()); + auto ToAssociatedDecl = importChecked(Err, E->getAssociatedDecl()); + if (Err) + return std::move(Err); + + return new (Importer.getToContext()) SubstNonTypeTemplateParmPackExpr( + ToType, E->getValueKind(), ToPackLoc, ToArgPack, ToAssociatedDecl, + E->getIndex(), E->getFinal()); +} + +ExpectedStmt ASTNodeImporter::VisitPseudoObjectExpr(PseudoObjectExpr *E) { + SmallVector ToSemantics(E->getNumSemanticExprs()); + if (Error Err = ImportContainerChecked(E->semantics(), ToSemantics)) + return std::move(Err); + auto ToSyntOrErr = import(E->getSyntacticForm()); + if (!ToSyntOrErr) + return ToSyntOrErr.takeError(); + return PseudoObjectExpr::Create(Importer.getToContext(), *ToSyntOrErr, + ToSemantics, E->getResultExprIndex()); +} + +ExpectedStmt +ASTNodeImporter::VisitCXXParenListInitExpr(CXXParenListInitExpr *E) { + Error Err = Error::success(); + auto ToType = importChecked(Err, E->getType()); + auto ToInitLoc = importChecked(Err, E->getInitLoc()); + auto ToBeginLoc = importChecked(Err, E->getBeginLoc()); + auto ToEndLoc = importChecked(Err, E->getEndLoc()); + if (Err) + return std::move(Err); + + SmallVector ToArgs(E->getInitExprs().size()); + if (Error Err = ImportContainerChecked(E->getInitExprs(), ToArgs)) + return std::move(Err); + return CXXParenListInitExpr::Create(Importer.getToContext(), ToArgs, ToType, + E->getUserSpecifiedInitExprs().size(), + ToInitLoc, ToBeginLoc, ToEndLoc); +} + Error ASTNodeImporter::ImportOverriddenMethods(CXXMethodDecl *ToMethod, CXXMethodDecl *FromMethod) { Error ImportErrors = Error::success(); diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index d2b8bf5497d26..4cb47cb5aea5a 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -38,6 +38,7 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" @@ -6289,6 +6290,24 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, pushDestroy(QualType::DK_nontrivial_c_struct, Ret.getAggregateAddress(), RetTy); + // Generate function declaration DISuprogram in order to be used + // in debug info about call sites. + if (CGDebugInfo *DI = getDebugInfo()) { + // Ensure call site info would actually be emitted before collecting + // further callee info. + if (CalleeDecl && !CalleeDecl->hasAttr() && + DI->getCallSiteRelatedAttrs() != llvm::DINode::FlagZero) { + CodeGenFunction CalleeCGF(CGM); + const GlobalDecl &CalleeGlobalDecl = + Callee.getAbstractInfo().getCalleeDecl(); + CalleeCGF.CurGD = CalleeGlobalDecl; + FunctionArgList Args; + QualType ResTy = CalleeCGF.BuildFunctionArgList(CalleeGlobalDecl, Args); + DI->EmitFuncDeclForCallSite( + CI, DI->getFunctionType(CalleeDecl, ResTy, Args), CalleeGlobalDecl); + } + } + return Ret; } diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index c4c34d9c0f5b5..d20497ba45ec1 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -5019,7 +5019,7 @@ void CGDebugInfo::EmitFunctionDecl(GlobalDecl GD, SourceLocation Loc, void CGDebugInfo::EmitFuncDeclForCallSite(llvm::CallBase *CallOrInvoke, QualType CalleeType, - const FunctionDecl *CalleeDecl) { + GlobalDecl CalleeGlobalDecl) { if (!CallOrInvoke) return; auto *Func = dyn_cast(CallOrInvoke->getCalledOperand()); @@ -5028,6 +5028,9 @@ void CGDebugInfo::EmitFuncDeclForCallSite(llvm::CallBase *CallOrInvoke, if (Func->getSubprogram()) return; + const FunctionDecl *CalleeDecl = + cast(CalleeGlobalDecl.getDecl()); + // Do not emit a declaration subprogram for a function with nodebug // attribute, or if call site info isn't required. if (CalleeDecl->hasAttr() || @@ -5038,7 +5041,8 @@ void CGDebugInfo::EmitFuncDeclForCallSite(llvm::CallBase *CallOrInvoke, // create the one describing the function in order to have complete // call site debug info. if (!CalleeDecl->isStatic() && !CalleeDecl->isInlined()) - EmitFunctionDecl(CalleeDecl, CalleeDecl->getLocation(), CalleeType, Func); + EmitFunctionDecl(CalleeGlobalDecl, CalleeDecl->getLocation(), CalleeType, + Func); } void CGDebugInfo::EmitInlineFunctionStart(CGBuilderTy &Builder, GlobalDecl GD) { diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h index 6ea825f9693c0..a4d26d168ea79 100644 --- a/clang/lib/CodeGen/CGDebugInfo.h +++ b/clang/lib/CodeGen/CGDebugInfo.h @@ -511,7 +511,7 @@ class CGDebugInfo { /// This is needed for call site debug info. void EmitFuncDeclForCallSite(llvm::CallBase *CallOrInvoke, QualType CalleeType, - const FunctionDecl *CalleeDecl); + GlobalDecl CalleeGlobalDecl); /// Constructs the debug code for exiting a function. void EmitFunctionEnd(CGBuilderTy &Builder, llvm::Function *Fn); @@ -686,6 +686,10 @@ class CGDebugInfo { /// Emit symbol for debugger that holds the pointer to the vtable. void emitVTableSymbol(llvm::GlobalVariable *VTable, const CXXRecordDecl *RD); + /// Return flags which enable debug info emission for call sites, provided + /// that it is supported and enabled. + llvm::DINode::DIFlags getCallSiteRelatedAttrs() const; + private: /// Amend \p I's DebugLoc with \p Group (its source atom group) and \p /// Rank (lower nonzero rank is higher precedence). Does nothing if \p I @@ -864,10 +868,6 @@ class CGDebugInfo { StringRef LinkageName, llvm::dwarf::MemorySpace MS, llvm::GlobalVariable *Var, llvm::DIScope *DContext); - /// Return flags which enable debug info emission for call sites, provided - /// that it is supported and enabled. - llvm::DINode::DIFlags getCallSiteRelatedAttrs() const; - /// Get the printing policy for producing names for debug info. PrintingPolicy getPrintingPolicy() const; diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index c723b9c48f7f9..13147c4f7d54d 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -6647,15 +6647,6 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, E == MustTailCall, E->getExprLoc()); if (auto *CalleeDecl = dyn_cast_or_null(TargetDecl)) { - // Generate function declaration DISuprogram in order to be used - // in debug info about call sites. - if (CGDebugInfo *DI = getDebugInfo()) { - FunctionArgList Args; - QualType ResTy = BuildFunctionArgList(CalleeDecl, Args); - DI->EmitFuncDeclForCallSite(LocalCallOrInvoke, - DI->getFunctionType(CalleeDecl, ResTy, Args), - CalleeDecl); - } if (CalleeDecl->hasAttr() || CalleeDecl->hasAttr()) { // Function has 'malloc' (aka. 'restrict') or 'alloc_size' attribute. diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 5041f59a75c6d..3452ce5339174 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -850,8 +850,11 @@ void ToolChain::addFortranRuntimeLibs(const ArgList &Args, options::OPT_fno_openmp, false)) { Driver::OpenMPRuntimeKind OMPRuntime = getDriver().getOpenMPRuntime(Args); ToolChain::RuntimeLibType RuntimeLib = GetRuntimeLibType(Args); - if (OMPRuntime == Driver::OMPRT_OMP && RuntimeLib == ToolChain::RLT_Libgcc) + if ((OMPRuntime == Driver::OMPRT_OMP && + RuntimeLib == ToolChain::RLT_Libgcc) && + !getTriple().isKnownWindowsMSVCEnvironment()) { CmdArgs.push_back("-latomic"); + } } } diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index 203b600078842..4a02c96620335 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -511,7 +511,7 @@ _mm512_packs_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, @@ -519,9 +519,8 @@ _mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, (__v32hi)_mm512_packs_epi32(__A, __B), (__v32hi)__W); @@ -532,7 +531,7 @@ _mm512_packs_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -540,7 +539,7 @@ _mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -553,7 +552,7 @@ _mm512_packus_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, @@ -561,7 +560,7 @@ _mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, @@ -574,7 +573,7 @@ _mm512_packus_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -582,7 +581,7 @@ _mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, @@ -595,17 +594,15 @@ _mm512_adds_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_add_sat((__v64qs)__A, (__v64qs)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_adds_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_adds_epi8(__A, __B), (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_adds_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_adds_epi8(__A, __B), (__v64qi)_mm512_setzero_si512()); @@ -616,7 +613,7 @@ _mm512_adds_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_add_sat((__v32hi)__A, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -624,7 +621,7 @@ _mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -637,7 +634,7 @@ _mm512_adds_epu8(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_add_sat((__v64qu) __A, (__v64qu) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, @@ -645,7 +642,7 @@ _mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, @@ -658,17 +655,15 @@ _mm512_adds_epu16(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_add_sat((__v32hu) __A, (__v32hu) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_adds_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_adds_epu16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_adds_epu16(__A, __B), (__v32hi)_mm512_setzero_si512()); @@ -886,7 +881,7 @@ _mm512_subs_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_sub_sat((__v64qs)__A, (__v64qs)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, @@ -894,7 +889,7 @@ _mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, @@ -907,7 +902,7 @@ _mm512_subs_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_sub_sat((__v32hi)__A, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -915,7 +910,7 @@ _mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -928,7 +923,7 @@ _mm512_subs_epu8(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_sub_sat((__v64qu) __A, (__v64qu) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, @@ -936,7 +931,7 @@ _mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, @@ -949,7 +944,7 @@ _mm512_subs_epu16(__m512i __A, __m512i __B) { return (__m512i)__builtin_elementwise_sub_sat((__v32hu) __A, (__v32hu) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -957,7 +952,7 @@ _mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, @@ -1191,14 +1186,14 @@ _mm512_unpackhi_epi8(__m512i __A, __m512i __B) { 62, 64+62, 63, 64+63); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_unpackhi_epi8(__A, __B), (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_unpackhi_epi8(__A, __B), @@ -1218,14 +1213,14 @@ _mm512_unpackhi_epi16(__m512i __A, __m512i __B) { 30, 32+30, 31, 32+31); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_unpackhi_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_unpackhi_epi16(__A, __B), @@ -1253,14 +1248,14 @@ _mm512_unpacklo_epi8(__m512i __A, __m512i __B) { 54, 64+54, 55, 64+55); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_unpacklo_epi8(__A, __B), (__v64qi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_unpacklo_epi8(__A, __B), @@ -1280,14 +1275,14 @@ _mm512_unpacklo_epi16(__m512i __A, __m512i __B) { 26, 32+26, 27, 32+27); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_unpacklo_epi16(__A, __B), (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_unpacklo_epi16(__A, __B), @@ -1566,7 +1561,7 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) { ((__m512i)__builtin_ia32_psrldqi512_byteshift((__v64qi)(__m512i)(a), \ (int)(imm))) -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) { return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U, @@ -1574,23 +1569,21 @@ _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) (__v32hi) __W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_mov_epi16(__mmask32 __U, __m512i __A) { return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U, (__v32hi) __A, (__v32hi) _mm512_setzero_si512 ()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_mov_epi8(__m512i __W, __mmask64 __U, __m512i __A) { return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U, (__v64qi) __A, (__v64qi) __W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A) { return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U, @@ -1598,7 +1591,7 @@ _mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A) (__v64qi) _mm512_setzero_si512 ()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A) { return (__m512i) __builtin_ia32_selectb_512(__M, @@ -1606,9 +1599,8 @@ _mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A) (__v64qi) __O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_set1_epi8 (__mmask64 __M, char __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_set1_epi8(__mmask64 __M, char __A) { return (__m512i) __builtin_ia32_selectb_512(__M, (__v64qi) _mm512_set1_epi8(__A), (__v64qi) _mm512_setzero_si512()); @@ -1801,7 +1793,7 @@ _mm512_broadcastb_epi8(__m128i __A) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectb_512(__M, @@ -1809,15 +1801,14 @@ _mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A) (__v64qi) __O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectb_512(__M, (__v64qi) _mm512_broadcastb_epi8(__A), (__v64qi) _mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A) { return (__m512i) __builtin_ia32_selectw_512(__M, @@ -1825,9 +1816,8 @@ _mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A) (__v32hi) __O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_set1_epi16 (__mmask32 __M, short __A) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_set1_epi16(__mmask32 __M, short __A) { return (__m512i) __builtin_ia32_selectw_512(__M, (__v32hi) _mm512_set1_epi16(__A), (__v32hi) _mm512_setzero_si512()); @@ -1840,7 +1830,7 @@ _mm512_broadcastw_epi16(__m128i __A) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectw_512(__M, @@ -1848,7 +1838,7 @@ _mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A) (__v32hi) __O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectw_512(__M, diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h index 575c0c8962662..d23188ab02b6c 100644 --- a/clang/lib/Headers/avx512vlbwintrin.h +++ b/clang/lib/Headers/avx512vlbwintrin.h @@ -536,14 +536,14 @@ _mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) { (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, (__v8hi)_mm_packs_epi32(__A, __B), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, @@ -551,7 +551,7 @@ _mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -559,7 +559,7 @@ _mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -567,7 +567,7 @@ _mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -575,7 +575,7 @@ _mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -583,7 +583,7 @@ _mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -591,7 +591,7 @@ _mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -599,7 +599,7 @@ _mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, @@ -607,7 +607,7 @@ _mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, @@ -615,7 +615,7 @@ _mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -623,7 +623,7 @@ _mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -631,7 +631,7 @@ _mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -639,7 +639,7 @@ _mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -647,7 +647,7 @@ _mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -655,7 +655,7 @@ _mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -663,7 +663,7 @@ _mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -671,7 +671,7 @@ _mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -679,7 +679,7 @@ _mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -687,7 +687,7 @@ _mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -695,7 +695,7 @@ _mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -703,7 +703,7 @@ _mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -711,7 +711,7 @@ _mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -719,7 +719,7 @@ _mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -727,7 +727,7 @@ _mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -735,7 +735,7 @@ _mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -743,7 +743,7 @@ _mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -751,7 +751,7 @@ _mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -759,7 +759,7 @@ _mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -767,7 +767,7 @@ _mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -775,7 +775,7 @@ _mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -783,7 +783,7 @@ _mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1095,7 +1095,7 @@ _mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) { (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -1103,7 +1103,7 @@ _mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -1111,7 +1111,7 @@ _mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -1119,7 +1119,7 @@ _mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -1127,7 +1127,7 @@ _mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1135,7 +1135,7 @@ _mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1143,7 +1143,7 @@ _mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1151,7 +1151,7 @@ _mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1159,7 +1159,7 @@ _mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -1167,7 +1167,7 @@ _mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -1175,7 +1175,7 @@ _mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -1183,7 +1183,7 @@ _mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -1191,7 +1191,7 @@ _mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1199,7 +1199,7 @@ _mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1207,7 +1207,7 @@ _mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1215,7 +1215,7 @@ _mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1432,14 +1432,14 @@ _mm_cvtepi16_epi8(__m128i __A) { 12, 13, 14, 15); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A, (__v16qi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A, (__v16qi) _mm_setzero_si128(), @@ -1588,112 +1588,112 @@ _mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { (__mmask16)__U, (__v16qi)_mm_unpackhi_epi8(__A, __B), (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, (__v16qi)_mm_unpackhi_epi8(__A, __B), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_unpackhi_epi8(__A, __B), (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_unpackhi_epi8(__A, __B), (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_unpackhi_epi16(__A, __B), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_unpackhi_epi16(__A, __B), (__v8hi) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_unpackhi_epi16(__A, __B), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_unpackhi_epi16(__A, __B), (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, (__v16qi)_mm_unpacklo_epi8(__A, __B), (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, (__v16qi)_mm_unpacklo_epi8(__A, __B), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_unpacklo_epi8(__A, __B), (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_unpacklo_epi8(__A, __B), (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_unpacklo_epi16(__A, __B), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_unpacklo_epi16(__A, __B), (__v8hi) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_unpacklo_epi16(__A, __B), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_unpacklo_epi16(__A, __B), (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1701,7 +1701,7 @@ _mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1709,7 +1709,7 @@ _mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1717,7 +1717,7 @@ _mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1726,7 +1726,7 @@ _mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1734,7 +1734,7 @@ _mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1742,7 +1742,7 @@ _mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1750,7 +1750,7 @@ _mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1877,7 +1877,7 @@ _mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1885,7 +1885,7 @@ _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, unsigned int __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -2173,7 +2173,7 @@ _mm256_maskz_mov_epi8(__mmask32 __U, __m256i __A) { (__v32qi) _mm256_setzero_si256 ()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A) { return (__m128i) __builtin_ia32_selectb_128(__M, @@ -2181,7 +2181,7 @@ _mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A) (__v16qi) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_set1_epi8 (__mmask16 __M, char __A) { return (__m128i) __builtin_ia32_selectb_128(__M, @@ -2189,7 +2189,7 @@ _mm_maskz_set1_epi8 (__mmask16 __M, char __A) (__v16qi) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A) { return (__m256i) __builtin_ia32_selectb_256(__M, @@ -2197,7 +2197,7 @@ _mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A) (__v32qi) __O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_set1_epi8 (__mmask32 __M, char __A) { return (__m256i) __builtin_ia32_selectb_256(__M, @@ -2528,7 +2528,7 @@ _mm256_movm_epi16 (__mmask16 __A) return (__m256i) __builtin_ia32_cvtmask2w256 (__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectb_128(__M, @@ -2536,7 +2536,7 @@ _mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A) (__v16qi) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectb_128(__M, @@ -2544,7 +2544,7 @@ _mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A) (__v16qi) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectb_256(__M, @@ -2552,7 +2552,7 @@ _mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A) (__v32qi) __O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectb_256(__M, @@ -2560,7 +2560,7 @@ _mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A) (__v32qi) _mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128(__M, @@ -2568,7 +2568,7 @@ _mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A) (__v8hi) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128(__M, @@ -2576,7 +2576,7 @@ _mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A) (__v8hi) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectw_256(__M, @@ -2584,7 +2584,7 @@ _mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A) (__v16hi) __O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectw_256(__M, @@ -2592,7 +2592,7 @@ _mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A) (__v16hi) _mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A) { return (__m256i) __builtin_ia32_selectw_256 (__M, @@ -2600,7 +2600,7 @@ _mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A) (__v16hi) __O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_set1_epi16 (__mmask16 __M, short __A) { return (__m256i) __builtin_ia32_selectw_256(__M, @@ -2608,7 +2608,7 @@ _mm256_maskz_set1_epi16 (__mmask16 __M, short __A) (__v16hi) _mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A) { return (__m128i) __builtin_ia32_selectw_128(__M, @@ -2616,7 +2616,7 @@ _mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A) (__v8hi) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_set1_epi16 (__mmask8 __M, short __A) { return (__m128i) __builtin_ia32_selectw_128(__M, diff --git a/clang/test/C/C2y/n3348.c b/clang/test/C/C2y/n3348.c new file mode 100644 index 0000000000000..e20c9f74883f9 --- /dev/null +++ b/clang/test/C/C2y/n3348.c @@ -0,0 +1,44 @@ +// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s + +/* WG14 N3348: No + * Matching of Multi-Dimensional Arrays in Generic Selection Expressions + * + * This allows use of * in a _Generic association as a placeholder for any size + * value. + * + * FIXME: Clang doesn't yet implement this paper. When we do implement it, we + * should expose the functionality in earlier language modes (C89) for + * compatibility with GCC. + */ + +void test(int n, int m) { + static_assert(1 == _Generic(int[3][2], int[3][*]: 1, int[2][*]: 0)); /* expected-error {{star modifier used outside of function prototype}} + expected-error {{array has incomplete element type 'int[]'}} + */ + static_assert(1 == _Generic(int[3][2], int[*][2]: 1, int[*][3]: 0)); // expected-error {{star modifier used outside of function prototype}} + static_assert(1 == _Generic(int[3][n], int[3][*]: 1, int[2][*]: 0)); /* expected-error {{star modifier used outside of function prototype}} + expected-error {{array has incomplete element type 'int[]'}} + */ + static_assert(1 == _Generic(int[n][m], int[*][*]: 1, char[*][*]: 0)); /* expected-error 2 {{star modifier used outside of function prototype}} + expected-error {{array has incomplete element type 'int[]'}} + */ + static_assert(1 == _Generic(int(*)[2], int(*)[*]: 1)); // expected-error {{star modifier used outside of function prototype}} +} + +void questionable() { + // GCC accepts this despite the * appearing outside of a generic association, + // but it's not clear whether that's intentionally supported or an oversight. + // It gives a warning about * being used outside of a declaration, but not + // with an associated warning group. + static_assert(1 == _Generic(int[*][*], int[2][100]: 1)); /* expected-error 2 {{star modifier used outside of function prototype}} + expected-error {{array has incomplete element type 'int[]'}} + */ + // GCC claims this matches multiple associations, so the functionality seems + // like it may be intended to work? + (void)_Generic(int[*][*], /* expected-error 2 {{star modifier used outside of function prototype}} + expected-error {{array has incomplete element type 'int[]'}} + */ + int[2][100]: 1, + int[3][1000]: 2, + ); +} diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c index 834e140018c34..0b73c7b14d869 100644 --- a/clang/test/CodeGen/X86/avx512bw-builtins.c +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c @@ -1205,12 +1205,16 @@ __m512i test_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) { // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_packs_epi32(__M,__A,__B); } +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_packs_epi32((__mmask32)0xAAAAAAAA,(__m512i)(__v16si){40000,-50000,32767,-32768,70000,-70000,42,-42,0,1,-1,30000,32768,-32769,65535,-65536},(__m512i)(__v16si){0,1,-1,65536,-1000000,1000000,32768,-32769,123456,-123456,32767,-32768,22222,-22222,40000,-40000}),0,-32768,0,-32768,0,1,0,32767,0,-32768,0,-42,0,32767,0,-32768,0,1,0,30000,0,-32768,0,-32768,0,-32768,0,-32768,0,-22222,0,-32768)); + __m512i test_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_packs_epi32 // CHECK: @llvm.x86.avx512.packssdw.512 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_packs_epi32(__W,__M,__A,__B); } +TEST_CONSTEXPR(match_v32hi(_mm512_mask_packs_epi32((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,(__m512i)(__v16si){40000,-50000,32767,-32768,70000,-70000,42,-42,0,1,-1,30000,32768,-32769,65535,-65536},(__m512i)(__v16si){0,1,-1,65536,-1000000,1000000,32768,-32769,123456,-123456,32767,-32768,22222,-22222,40000,-40000}),1,-32768,3,-32768,5,1,7,32767,9,-32768,11,-42,13,32767,15,-32768,17,1,19,30000,21,-32768,23,-32768,25,-32768,27,-32768,29,-22222,31,-32768)); + __m512i test_mm512_packs_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_packs_epi16 // CHECK: @llvm.x86.avx512.packsswb.512 @@ -1223,48 +1227,62 @@ __m512i test_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_packs_epi16(__W,__M,__A,__B); } +TEST_CONSTEXPR(match_v64qi(_mm512_mask_packs_epi16((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v32hi){130,-200,127,-128,300,-1000,42,-42,32767,-32767,127,-128,30000,-30000,90,-90,130,-200,0,-1,126,-127,128,-129,500,-500,7,-7,255,-255,127,-128},(__m512i)(__v32hi){0,1,-1,255,-129,128,20000,-32768,5,-5,100,-100,127,-128,512,-512,1,2,-2,300,-300,127,-128,42,0,1,-1,127,-128,90,-90,-32768}),1,-128,3,-128,5,-128,7,-42,9,1,11,127,13,127,15,-128,17,-128,19,-128,21,-128,23,-90,25,-5,27,-100,29,-128,31,-128,33,-128,35,-1,37,-127,39,-128,41,2,43,127,45,127,47,42,49,-128,51,-7,53,-128,55,-128,57,1,59,127,61,90,63,-128)); + __m512i test_mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_packs_epi16 // CHECK: @llvm.x86.avx512.packsswb.512 // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_packs_epi16(__M,__A,__B); } +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_packs_epi16((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v32hi){130,-200,127,-128,300,-1000,42,-42,32767,-32767,127,-128,30000,-30000,90,-90,130,-200,0,-1,126,-127,128,-129,500,-500,7,-7,255,-255,127,-128},(__m512i)(__v32hi){0,1,-1,255,-129,128,20000,-32768,5,-5,100,-100,127,-128,512,-512,1,2,-2,300,-300,127,-128,42,0,1,-1,127,-128,90,-90,-32768}),0,-128,0,-128,0,-128,0,-42,0,1,0,127,0,127,0,-128,0,-128,0,-128,0,-128,0,-90,0,-5,0,-100,0,-128,0,-128,0,-128,0,-1,0,-127,0,-128,0,2,0,127,0,127,0,42,0,-128,0,-7,0,-128,0,-128,0,1,0,127,0,90,0,-128)); + __m512i test_mm512_packus_epi32(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_packus_epi32 // CHECK: @llvm.x86.avx512.packusdw.512 return _mm512_packus_epi32(__A,__B); } TEST_CONSTEXPR(match_v32hi(_mm512_packus_epi32((__m512i)(__v16si){40000, -50000, 32767, -32768, 70000, -70000, 42, -42, 0, 1, -1, 65535, 32768, -32769, 22222, -22222}, (__m512i)(__v16si){0, 1, -1, 65536, -1000000, 1000000, 32768, -32769, 123456, -123456, 32767, -32768, 40000, -40000, 65535, 0}), -25536, 0, 32767, 0, 0, 1, 0, -1, -1, 0, 42, 0, 0, -1, -32768, 0, 0, 1, 0, -1, -1, 0, 32767, 0, -32768, 0, 22222, 0, -25536, 0, -1, 0)); + __m512i test_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_packus_epi32 // CHECK: @llvm.x86.avx512.packusdw.512 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_packus_epi32(__M,__A,__B); } +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_packus_epi32((__mmask32)0xAAAAAAAA,(__m512i)(__v16si){40000,-50000,32767,-32768,70000,-70000,42,-42,0,1,-1,65535,32768,-32769,22222,-22222},(__m512i)(__v16si){0,1,-1,65536,-1000000,1000000,32768,-32769,123456,-123456,32767,-32768,40000,-40000,65535,0}),0,0,0,0,0,1,0,-1,0,0,0,0,0,-1,0,0,0,1,0,-1,0,0,0,0,0,0,0,0,0,0,0,0)); + __m512i test_mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_packus_epi32 // CHECK: @llvm.x86.avx512.packusdw.512 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_packus_epi32(__W,__M,__A,__B); } +TEST_CONSTEXPR(match_v32hi(_mm512_mask_packus_epi32((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,(__m512i)(__v16si){40000,-50000,32767,-32768,70000,-70000,42,-42,0,1,-1,65535,32768,-32769,22222,-22222},(__m512i)(__v16si){0,1,-1,65536,-1000000,1000000,32768,-32769,123456,-123456,32767,-32768,40000,-40000,65535,0}),1,0,3,0,5,1,7,-1,9,0,11,0,13,-1,15,0,17,1,19,-1,21,0,23,0,25,0,27,0,29,0,31,0)); + __m512i test_mm512_packus_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_packus_epi16 // CHECK: @llvm.x86.avx512.packuswb.512 return _mm512_packus_epi16(__A,__B); } TEST_CONSTEXPR(match_v64qi(_mm512_packus_epi16((__m512i)(__v32hi){-1, 0, 1, 127, 128, 255, 256, -200, 300, 42, -42, 500, 20000, -32768, 129, -129, -1, 0, 1, 127, 128, 255, 256, -200, 300, 42, -42, 500, 20000, -32768, 129, -129}, (__m512i)(__v32hi){0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90, 0, 1, -1, 255, -129, 128, 20000, -32768, 32767, -32767, 127, -128, 30000, -30000, 90, -90}), 0, 0, 1, 127, -128, -1, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0, -1, 42, 0, -1, -1, 0, -127, 0, -1, 0, 127, 0, -1, 0, 90, 0, 0, 0, 1, 127, -128, -1, -1, 0, 0, 1, 0, -1, 0, -128, -1, 0, -1, 42, 0, -1, -1, 0, -127, 0, -1, 0, 127, 0, -1, 0, 90, 0)); + __m512i test_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_packus_epi16 // CHECK: @llvm.x86.avx512.packuswb.512 // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_packus_epi16(__W,__M,__A,__B); } +TEST_CONSTEXPR(match_v64qi(_mm512_mask_packus_epi16((__m512i)(__v64qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v32hi){-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129,-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129},(__m512i)(__v32hi){0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90,0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90}),1,0,3,127,5,-1,7,0,9,1,11,-1,13,-128,15,0,17,42,19,-1,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,127,37,-1,39,0,41,1,43,-1,45,-128,47,0,49,42,51,-1,53,0,55,0,57,0,59,0,61,0,63,0)); + __m512i test_mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_packus_epi16 // CHECK: @llvm.x86.avx512.packuswb.512 // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_packus_epi16(__M,__A,__B); } +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_packus_epi16((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v32hi){-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129,-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129},(__m512i)(__v32hi){0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90,0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90}),0,0,0,127,0,-1,0,0,0,1,0,-1,0,-128,0,0,0,42,0,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,127,0,-1,0,0,0,1,0,-1,0,-128,0,0,0,42,0,-1,0,0,0,0,0,0,0,0,0,0,0,0)); + __m512i test_mm512_adds_epi8(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_adds_epi8 // CHECK: @llvm.sadd.sat.v64i8 @@ -1278,18 +1296,22 @@ __m512i test_mm512_mask_adds_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m51 // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_adds_epi8(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v64qi(_mm512_mask_adds_epi8((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,-28,+29,-30,+31,-32,+33,-34,+35,-36,+37,-38,+39,-40,+41,-42,+43,-44,+45,-46,+47,+100,+50,-100,+20,+80,-50,+120,-20,-100,-50,+100,-20,-80,+50,-120,+20},(__m512i)(__v64qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,-28,+29,-30,+31,-32,+33,-34,+35,-36,+37,-38,+39,-40,+41,-42,+43,-44,+45,-46,+47,+50,+80,-50,+110,+60,-30,+20,-10,+50,+80,-50,+110,+60,-30,+20,-10}),1,+2,3,+6,5,+10,7,+14,9,+18,11,+22,13,+26,15,+30,17,+34,19,+38,21,+42,23,+46,25,+50,27,+54,29,+58,31,+62,33,+66,35,+70,37,+74,39,+78,41,+82,43,+86,45,+90,47,+94,49,+127,51,+127,53,-80,+55,-30,57,+30,59,+90,61,+20,63,+10)); + __m512i test_mm512_maskz_adds_epi8(__mmask64 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_adds_epi8 // CHECK: @llvm.sadd.sat.v64i8 // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_adds_epi8(__U,__A,__B); } +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_adds_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,-28,+29,-30,+31,-32,+33,-34,+35,-36,+37,-38,+39,-40,+41,-42,+43,-44,+45,-46,+47,+100,+50,-100,+20,+80,-50,+120,-20,-100,-50,+100,-20,-80,+50,-120,+20},(__m512i)(__v64qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,-28,+29,-30,+31,-32,+33,-34,+35,-36,+37,-38,+39,-40,+41,-42,+43,-44,+45,-46,+47,+50,+80,-50,+110,+60,-30,+20,-10,+50,+80,-50,+110,+60,-30,+20,-10}),0,+2,0,+6,0,+10,0,+14,0,+18,0,+22,0,+26,0,+30,0,+34,0,+38,0,+42,0,+46,0,+50,0,+54,0,+58,0,+62,0,+66,0,+70,0,+74,0,+78,0,+82,0,+86,0,+90,0,+94,0,+127,0,+127,0,-80,0,-30,0,+30,0,+90,0,+20,0,+10)); + __m512i test_mm512_adds_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_adds_epi16 // CHECK: @llvm.sadd.sat.v32i16 return _mm512_adds_epi16(__A,__B); } -TEST_CONSTEXPR(match_v32hi(_mm512_adds_epi16((__m512i)(__v32hi){0, +1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, +32000, -32000, +32000, -32000}, (__m512i)(__v32hi){0, +1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, +800, -800, -800, +800}), 0, +2, -4, +6, -8, +10, -12, +14, -16, +18, -20, +22, -24, +26, -28, +30, -32, +34, -36, +38, -40, +42, -44, +46, -48, +50, -52, +54, +32767, -32768, +31200, -31200)); +TEST_CONSTEXPR(match_v32hi(_mm512_adds_epi16((__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+32000,-32000,+32000,-32000},(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+800,-800,-800,+800}),0,+2,-4,+6,-8,+10,-12,+14,-16,+18,-20,+22,-24,+26,-28,+30,-32,+34,-36,+38,-40,+42,-44,+46,-48,+50,-52,+54,+32767,-32768,+31200,-31200)); __m512i test_mm512_mask_adds_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_adds_epi16 @@ -1297,12 +1319,16 @@ __m512i test_mm512_mask_adds_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m5 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_adds_epi16(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v32hi(_mm512_mask_adds_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAAu,(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+800,-800,-800,+800}),1,+2,3,+6,5,+10,7,+14,9,+18,11,+22,13,+26,15,+30,17,+34,19,+38,21,+42,23,+46,25,+50,27,+54,29,-32768,31,+32767)); + __m512i test_mm512_maskz_adds_epi16(__mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_adds_epi16 // CHECK: @llvm.sadd.sat.v32i16 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_adds_epi16(__U,__A,__B); } +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_adds_epi16((__mmask32)0xAAAAAAAAu,(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m512i)(__v32hi){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,-16,+17,-18,+19,-20,+21,-22,+23,-24,+25,-26,+27,+800,-800,-800,+800}),0,+2,0,+6,0,+10,0,+14,0,+18,0,+22,0,+26,0,+30,0,+34,0,+38,0,+42,0,+46,0,+50,0,+54,0,-32768,0,+32767)); + __m512i test_mm512_adds_epu8(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_adds_epu8 // CHECK-NOT: @llvm.x86.avx512.mask.paddus.b.512 @@ -1318,7 +1344,7 @@ __m512i test_mm512_mask_adds_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m51 // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_adds_epu8(__W,__U,__A,__B); } -TEST_CONSTEXPR(match_v32hu(_mm512_adds_epu16((__m512i)(__v32hu){0, 0, 0, 0, +16384, +16384, +16384, +16384, +16384, +16384, +32767, +32767, +32767, +32767, +32767, +32767, +32768, +32768, +32768, +32768, +32768, +32768, +49152, +49152, +49152, +49152, +49152, +49152, +65535, +65535, +65535, +65535}, (__m512i)(__v32hu){0, +32767, +32768, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +32767, +32768, +65535}), 0, +32767, +32768, +65535, +16384, +32768, +49151, +49152, +65535, +65535, +32767, +49151, +65534, +65535, +65535, +65535, +32768, +49152, +65535, +65535, +65535, +65535, +49152, +65535, +65535, +65535, +65535, +65535, +65535, +65535, +65535, +65535)); +TEST_CONSTEXPR(match_v64qu(_mm512_mask_adds_epu8((__m512i)(__v64qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+64,+64,+64,+64,+64,+64,+64,+64,+127,+127,+127,+127,+127,+127,+127,+127,+128,+128,+128,+128,+128,+128,+128,+128,+191,+191,+191,+191,+191,+191,+191,+191,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m512i)(__v64qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),1,+63,3,+127,5,+191,7,+255,9,+126,11,+190,13,+254,15,+255,17,+127,19,+191,21,+255,23,+255,25,+190,27,+254,29,+255,31,+255,33,+191,35,+255,37,+255,39,+255,41,+254,43,+255,45,+255,47,+255,49,+255,51,+255,53,+255,55,+255,57,+255,59,+255,61,+255,63,+255)); __m512i test_mm512_maskz_adds_epu8(__mmask64 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_adds_epu8 @@ -1327,12 +1353,16 @@ __m512i test_mm512_maskz_adds_epu8(__mmask64 __U, __m512i __A, __m512i __B) { // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_adds_epu8(__U,__A,__B); } +TEST_CONSTEXPR(match_v64qu(_mm512_maskz_adds_epu8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+64,+64,+64,+64,+64,+64,+64,+64,+127,+127,+127,+127,+127,+127,+127,+127,+128,+128,+128,+128,+128,+128,+128,+128,+191,+191,+191,+191,+191,+191,+191,+191,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m512i)(__v64qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),0,+63,0,+127,0,+191,0,+255,0,+126,0,+190,0,+254,0,+255,0,+127,0,+191,0,+255,0,+255,0,+190,0,+254,0,+255,0,+255,0,+191,0,+255,0,+255,0,+255,0,+254,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255)); + __m512i test_mm512_adds_epu16(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_adds_epu16 // CHECK-NOT: @llvm.x86.avx512.mask.paddus.w.512 // CHECK: call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_adds_epu16(__A,__B); } +TEST_CONSTEXPR(match_v32hu(_mm512_adds_epu16((__m512i)(__v32hu){0, 0, 0, 0, +16384, +16384, +16384, +16384, +16384, +16384, +32767, +32767, +32767, +32767, +32767, +32767, +32768, +32768, +32768, +32768, +32768, +32768, +49152, +49152, +49152, +49152, +49152, +49152, +65535, +65535, +65535, +65535}, (__m512i)(__v32hu){0, +32767, +32768, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +32767, +32768, +65535}), 0, +32767, +32768, +65535, +16384, +32768, +49151, +49152, +65535, +65535, +32767, +49151, +65534, +65535, +65535, +65535, +32768, +49152, +65535, +65535, +65535, +65535, +49152, +65535, +65535, +65535, +65535, +65535, +65535, +65535, +65535, +65535)); + __m512i test_mm512_mask_adds_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_adds_epu16 // CHECK-NOT: @llvm.x86.avx512.mask.paddus.w.512 @@ -1340,6 +1370,8 @@ __m512i test_mm512_mask_adds_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m5 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_adds_epu16(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v32hu(_mm512_mask_adds_epu16((__m512i)(__v32hu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,(__m512i)(__v32hu){0,0,0,0,+16384,+16384,+16384,+16384,+16384,+16384,+32767,+32767,+32767,+32767,+32767,+32767,+32768,+32768,+32768,+32768,+32768,+32768,+49152,+49152,+49152,+49152,+49152,+49152,+65535,+65535,+65535,+65535},(__m512i)(__v32hu){0,+32767,+32768,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+32767,+32768,+65535}),1,+32767,3,+65535,5,+32768,7,+49152,9,+65535,11,+49151,13,+65535,15,+65535,17,+49152,19,+65535,21,+65535,23,+65535,25,+65535,27,+65535,29,+65535,31,+65535)); + __m512i test_mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_adds_epu16 // CHECK-NOT: @llvm.x86.avx512.mask.paddus.w.512 @@ -1347,6 +1379,8 @@ __m512i test_mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, __m512i __B) { // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_adds_epu16(__U,__A,__B); } +TEST_CONSTEXPR(match_v32hu(_mm512_maskz_adds_epu16((__mmask32)0xAAAAAAAA,(__m512i)(__v32hu){0,0,0,0,+16384,+16384,+16384,+16384,+16384,+16384,+32767,+32767,+32767,+32767,+32767,+32767,+32768,+32768,+32768,+32768,+32768,+32768,+49152,+49152,+49152,+49152,+49152,+49152,+65535,+65535,+65535,+65535},(__m512i)(__v32hu){0,+32767,+32768,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+16384,+32767,+32768,+49152,+65535,0,+32767,+32768,+65535}),0,+32767,0,+65535,0,+32768,0,+49152,0,+65535,0,+49151,0,+65535,0,+65535,0,+49152,0,+65535,0,+65535,0,+65535,0,+65535,0,+65535,0,+65535,0,+65535)); + __m512i test_mm512_avg_epu8(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_avg_epu8 // CHECK: @llvm.x86.avx512.pavg.b.512 @@ -1640,12 +1674,16 @@ __m512i test_mm512_mask_subs_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m51 // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_subs_epi8(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v64qi(_mm512_mask_subs_epi8((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){1,-100,3,4,5,-6,7,100,9,-100,11,12,13,-14,15,100,17,-100,19,20,21,-22,23,100,25,-100,27,28,29,-30,31,100,33,-100,35,36,37,-38,39,100,41,-100,43,44,45,-46,47,100,49,-100,51,52,53,-54,55,100,57,-100,59,60,61,-62,63,100},(__m512i)(__v64qs){1,100,3,4,5,6,7,-100,9,100,11,12,13,14,15,-100,17,100,19,20,21,22,23,-100,25,100,27,28,29,30,31,-100,33,100,35,36,37,38,39,-100,41,100,43,44,45,46,47,-100,49,100,51,52,53,54,55,-100,57,100,59,60,61,62,63,-100}),1,-128,3,0,5,-12,7,127,9,-128,11,0,13,-28,15,127,17,-128,19,0,21,-44,23,127,25,-128,27,0,29,-60,31,127,33,-128,35,0,37,-76,39,127,41,-128,43,0,45,-92,47,127,49,-128,51,0,53,-108,55,127,57,-128,59,0,61,-124,63,127)); + __m512i test_mm512_maskz_subs_epi8(__mmask64 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_subs_epi8 // CHECK: @llvm.ssub.sat.v64i8 // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_subs_epi8(__U,__A,__B); } +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_subs_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){1,-100,3,4,5,-6,7,100,9,-100,11,12,13,-14,15,100,17,-100,19,20,21,-22,23,100,25,-100,27,28,29,-30,31,100,33,-100,35,36,37,-38,39,100,41,-100,43,44,45,-46,47,100,49,-100,51,52,53,-54,55,100,57,-100,59,60,61,-62,63,100},(__m512i)(__v64qs){1,100,3,4,5,6,7,-100,9,100,11,12,13,14,15,-100,17,100,19,20,21,22,23,-100,25,100,27,28,29,30,31,-100,33,100,35,36,37,38,39,-100,41,100,43,44,45,46,47,-100,49,100,51,52,53,54,55,-100,57,100,59,60,61,62,63,-100}),0,-128,0,0,0,-12,0,127,0,-128,0,0,0,-28,0,127,0,-128,0,0,0,-44,0,127,0,-128,0,0,0,-60,0,127,0,-128,0,0,0,-76,0,127,0,-128,0,0,0,-92,0,127,0,-128,0,0,0,-108,0,127,0,-128,0,0,0,-124,0,127)); + __m512i test_mm512_subs_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_subs_epi16 // CHECK: @llvm.ssub.sat.v32i16 @@ -1658,18 +1696,24 @@ __m512i test_mm512_mask_subs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m5 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_subs_epi16(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v32hi(_mm512_mask_subs_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,(__m512i)(__v32hi){1,-30000,3,30000,5,-6,7,8,9,-30000,11,30000,13,-14,15,16,17,-30000,19,30000,21,-22,23,24,25,-30000,27,30000,29,-30,31,32},(__m512i)(__v32hi){1,30000,3,-30000,5,6,7,-8,9,30000,11,-30000,13,14,15,-16,17,30000,19,-30000,21,22,23,-24,25,30000,27,-30000,29,30,31,-32}),1,-32768,3,32767,5,-12,7,16,9,-32768,11,32767,13,-28,15,32,17,-32768,19,32767,21,-44,23,48,25,-32768,27,32767,29,-60,31,64)); + __m512i test_mm512_maskz_subs_epi16(__mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_subs_epi16 // CHECK: @llvm.ssub.sat.v32i16 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_subs_epi16(__U,__A,__B); } +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_subs_epi16((__mmask32)0xAAAAAAAAu,(__m512i)(__v32hi){1,-30000,3,30000,5,-6,7,8,9,-30000,11,30000,13,-14,15,16,17,-30000,19,30000,21,-22,23,24,25,-30000,27,30000,29,-30,31,32},(__m512i)(__v32hi){1,30000,3,-30000,5,6,7,-8,9,30000,11,-30000,13,14,15,-16,17,30000,19,-30000,21,22,23,-24,25,30000,27,-30000,29,30,31,-32}),0,-32768,0,32767,0,-12,0,16,0,-32768,0,32767,0,-28,0,32,0,-32768,0,32767,0,-44,0,48,0,-32768,0,32767,0,-60,0,64)); + __m512i test_mm512_subs_epu8(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_subs_epu8 // CHECK-NOT: @llvm.x86.avx512.mask.psubus.b.512 // CHECK: call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %{{.*}}, <64 x i8> %{{.*}}) return _mm512_subs_epu8(__A,__B); } +TEST_CONSTEXPR(match_v64qu(_mm512_subs_epu8((__m512i)(__v64qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+64,+64,+64,+64,+64,+64,+64,+64,+127,+127,+127,+127,+127,+127,+127,+127,+128,+128,+128,+128,+128,+128,+128,+128,+191,+191,+191,+191,+191,+191,+191,+191,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m512i)(__v64qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),0,0,0,0,0,0,0,0,+63,0,0,0,0,0,0,0,+64,+1,0,0,0,0,0,0,+127,+64,+63,0,0,0,0,0,+128,+65,+64,+1,0,0,0,0,+191,+128,+127,+64,+63,0,0,0,+192,+129,+128,+65,+64,+1,0,0,+255,+192,+191,+128,+127,+64,+63,+0)); + __m512i test_mm512_mask_subs_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_subs_epu8 // CHECK-NOT: @llvm.x86.avx512.mask.psubus.b.512 @@ -1677,7 +1721,7 @@ __m512i test_mm512_mask_subs_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m51 // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_subs_epu8(__W,__U,__A,__B); } -TEST_CONSTEXPR(match_v64qu(_mm512_subs_epu8((__m512i)(__v64qu){0, 0, 0, 0, 0, 0, 0, 0, +63, +63, +63, +63, +63, +63, +63, +63, +64, +64, +64, +64, +64, +64, +64, +64, +127, +127, +127, +127, +127, +127, +127, +127, +128, +128, +128, +128, +128, +128, +128, +128, +191, +191, +191, +191, +191, +191, +191, +191, +192, +192, +192, +192, +192, +192, +192, +192, +255, +255, +255, +255, +255, +255, +255, +255}, (__m512i)(__v64qu){0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255, 0, +63, +64, +127, +128, +191, +192, +255}), 0, 0, 0, 0, 0, 0, 0, 0, +63, 0, 0, 0, 0, 0, 0, 0, +64, +1, 0, 0, 0, 0, 0, 0, +127, +64, +63, 0, 0, 0, 0, 0, +128, +65, +64, +1, 0, 0, 0, 0, +191, +128, +127, +64, +63, 0, 0, 0, +192, +129, +128, +65, +64, +1, 0, 0, +255, +192, +191, +128, +127, +64, +63, +0)); +TEST_CONSTEXPR(match_v64qu(_mm512_mask_subs_epu8((__m512i)(__v64qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m512i)(__v64qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),1,200,3,0,5,0,7,254,9,0,11,1,13,1,15,0,17,200,19,0,21,0,23,254,25,0,27,1,29,1,31,0,33,200,35,0,37,0,39,254,41,0,43,1,45,1,47,0,49,200,51,0,53,0,55,254,57,0,59,1,61,1,63,0)); __m512i test_mm512_maskz_subs_epu8(__mmask64 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_subs_epu8 @@ -1686,20 +1730,25 @@ __m512i test_mm512_maskz_subs_epu8(__mmask64 __U, __m512i __A, __m512i __B) { // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_subs_epu8(__U,__A,__B); } +TEST_CONSTEXPR(match_v64qu(_mm512_maskz_subs_epu8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m512i)(__v64qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0,0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0,0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0,0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0)); + __m512i test_mm512_subs_epu16(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_subs_epu16 // CHECK-NOT: @llvm.x86.avx512.mask.psubus.w.512 // CHECK: call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_subs_epu16(__A,__B); } +TEST_CONSTEXPR(match_v32hu(_mm512_subs_epu16((__m512i)(__v32hu){0, 0, 0, 0, +16384, +16384, +16384, +16384, +16384, +16384, +32767, +32767, +32767, +32767, +32767, +32767, +32768, +32768, +32768, +32768, +32768, +32768, +49152, +49152, +49152, +49152, +49152, +49152, +65535, +65535, +65535, +65535}, (__m512i)(__v32hu){0, +32767, +32768, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +32767, +32768, +65535}), 0, 0, 0, 0, +16384, 0, 0, 0, 0, 0, +32767, +16383, 0, 0, 0, 0, +32768, +16384, +1, 0, 0, 0, +49152, +32768, +16385, +16384, 0, 0, +65535, +32768, +32767, 0)); + __m512i test_mm512_mask_subs_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_subs_epu16 // CHECK-NOT: @llvm.x86.avx512.mask.psubus.w.512 // CHECK: call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_subs_epu16(__W,__U,__A,__B); -TEST_CONSTEXPR(match_v32hu(_mm512_subs_epu16((__m512i)(__v32hu){0, 0, 0, 0, +16384, +16384, +16384, +16384, +16384, +16384, +32767, +32767, +32767, +32767, +32767, +32767, +32768, +32768, +32768, +32768, +32768, +32768, +49152, +49152, +49152, +49152, +49152, +49152, +65535, +65535, +65535, +65535}, (__m512i)(__v32hu){0, +32767, +32768, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +16384, +32767, +32768, +49152, +65535, 0, +32767, +32768, +65535}), 0, 0, 0, 0, +16384, 0, 0, 0, 0, 0, +32767, +16383, 0, 0, 0, 0, +32768, +16384, +1, 0, 0, 0, +49152, +32768, +16385, +16384, 0, 0, +65535, +32768, +32767, 0)); } +TEST_CONSTEXPR(match_v32hu(_mm512_mask_subs_epu16((__m512i)(__v32hu){101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132},(__mmask32)0xAAAAAAAAu,(__m512i)(__v32hu){0,65000,0,40000,0,100,0,65535,0,0,0,1000,0,1,0,50000,0,65000,0,40000,0,100,0,65535,0,0,0,1000,0,1,0,50000},(__m512i)(__v32hu){0,5000,0,40000,0,200,0,1,0,1,0,65535,0,0,0,25000,0,5000,0,40000,0,200,0,1,0,1,0,65535,0,0,0,25000}),101,60000,103,0,105,0,107,65534,109,0,111,0,113,1,115,25000,117,60000,119,0,121,0,123,65534,125,0,127,0,129,1,131,25000)); + __m512i test_mm512_maskz_subs_epu16(__mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_subs_epu16 // CHECK-NOT: @llvm.x86.avx512.mask.psubus.w.512 @@ -1707,6 +1756,8 @@ __m512i test_mm512_maskz_subs_epu16(__mmask32 __U, __m512i __A, __m512i __B) { // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_subs_epu16(__U,__A,__B); } +TEST_CONSTEXPR(match_v32hu(_mm512_maskz_subs_epu16((__mmask32)0xAAAAAAAAu,(__m512i)(__v32hu){51,65000,0,40000,0,100,0,65535,42,0,0,1000,0,1,0,50000,69,65000,0,40000,0,100,0,65535,71,0,0,1000,0,1,0,50000},(__m512i)(__v32hu){2652,5000,0,40000,0,200,0,1,398,1,0,65535,0,0,0,25000,29625,5000,0,40000,0,200,0,1,25274,1,0,65535,0,0,0,25000}),0,60000,0,0,0,0,0,65534,0,0,0,0,0,1,0,25000,0,60000,0,0,0,0,0,65534,0,0,0,0,0,1,0,25000)); + __m512i test_mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U, __m512i __B) { // CHECK-LABEL: test_mm512_mask2_permutex2var_epi16 // CHECK: @llvm.x86.avx512.vpermi2var.hi.512 @@ -2041,6 +2092,7 @@ __m512i test_mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, _ // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_unpackhi_epi8(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_v64qi(_mm512_mask_unpackhi_epi8((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xFAAAAAAAAAAAAAAA,(__m512i)(__v64qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,-128,-127,-126,-125,-124,-123,-122,-121,-120,-119,-118,-117,-116,-115,-114,-113,-112,-111,-110,-109,-108,-107,-106,-105,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89},(__m512i)(__v64qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64}),1,-9,3,-10,5,-11,7,-12,9,-13,11,-14,13,-15,15,-16,17,-25,19,-26,21,-27,23,-28,25,-29,27,-30,29,-31,31,-32,33,-41,35,-42,37,-43,39,-44,41,-45,43,-46,45,-47,47,-48,49,-57,51,-58,53,-59,55,-60,57,-61,59,-62,-90,-63,-89,-64)); __m512i test_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_unpackhi_epi8 @@ -2048,6 +2100,7 @@ __m512i test_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_unpackhi_epi8(__U, __A, __B); } +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_unpackhi_epi8((__mmask64)0xFAAAAAAAAAAAAAAA,(__m512i)(__v64qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,-128,-127,-126,-125,-124,-123,-122,-121,-120,-119,-118,-117,-116,-115,-114,-113,-112,-111,-110,-109,-108,-107,-106,-105,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89},(__m512i)(__v64qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64}),0,-9,0,-10,0,-11,0,-12,0,-13,0,-14,0,-15,0,-16,0,-25,0,-26,0,-27,0,-28,0,-29,0,-30,0,-31,0,-32,0,-41,0,-42,0,-43,0,-44,0,-45,0,-46,0,-47,0,-48,0,-57,0,-58,0,-59,0,-60,0,-61,0,-62,-90,-63,-89,-64)); __m512i test_mm512_unpackhi_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_unpackhi_epi16 @@ -2063,6 +2116,7 @@ __m512i test_mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_unpackhi_epi16(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_v32hi(_mm512_mask_unpackhi_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xFAAAAAAAu,(__m512i)(__v32hi){100,101,102,103,104,105,106,107,110,111,112,113,114,115,116,117,120,121,122,123,124,125,126,127,130,131,132,133,134,135,136,137},(__m512i)(__v32hi){200,201,202,203,204,205,206,207,210,211,212,213,214,215,216,217,220,221,222,223,224,225,226,227,230,231,232,233,234,235,236,237}),1,204,3,205,5,206,7,207,9,214,11,215,13,216,15,217,17,224,19,225,21,226,23,227,25,234,27,235,136,236,137,237)); __m512i test_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_unpackhi_epi16 @@ -2070,6 +2124,7 @@ __m512i test_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_unpackhi_epi16(__U, __A, __B); } +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_unpackhi_epi16((__mmask32)0xFAAAAAAAu,(__m512i)(__v32hi){100,101,102,103,104,105,106,107,110,111,112,113,114,115,116,117,120,121,122,123,124,125,126,127,130,131,132,133,134,135,136,137},(__m512i)(__v32hi){200,201,202,203,204,205,206,207,210,211,212,213,214,215,216,217,220,221,222,223,224,225,226,227,230,231,232,233,234,235,236,237}),0,204,0,205,0,206,0,207,0,214,0,215,0,216,0,217,0,224,0,225,0,226,0,227,0,234,0,235,136,236,137,237)); __m512i test_mm512_unpacklo_epi8(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_unpacklo_epi8 @@ -2084,6 +2139,7 @@ __m512i test_mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, _ // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_unpacklo_epi8(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_v64qi(_mm512_mask_unpacklo_epi8((__m512i)(__v64qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xFAAAAAAAAAAAAAAA,(__m512i)(__v64qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64,-65},(__m512i)(__v64qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75}),1,-1,3,-2,5,-3,7,-4,9,-5,11,-6,13,-7,15,-8,17,20,19,21,21,22,23,23,25,24,27,25,29,26,31,27,33,40,35,41,37,42,39,43,41,44,43,45,45,46,47,47,49,60,51,61,53,62,55,63,57,64,59,65,-56,66,-57,67)); __m512i test_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_unpacklo_epi8 @@ -2091,6 +2147,7 @@ __m512i test_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_unpacklo_epi8(__U, __A, __B); } +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_unpacklo_epi8((__mmask64)0xFAAAAAAAAAAAAAAA,(__m512i)(__v64qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64,-65},(__m512i)(__v64qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75}),0,-1,0,-2,0,-3,0,-4,0,-5,0,-6,0,-7,0,-8,0,20,0,21,0,22,0,23,0,24,0,25,0,26,0,27,0,40,0,41,0,42,0,43,0,44,0,45,0,46,0,47,0,60,0,61,0,62,0,63,0,64,0,65,-56,66,-57,67)); __m512i test_mm512_unpacklo_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_unpacklo_epi16 @@ -2105,6 +2162,7 @@ __m512i test_mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_unpacklo_epi16(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_v32hi(_mm512_mask_unpacklo_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xFAAAAAAAu,(__m512i)(__v32hi){100,101,102,103,104,105,106,107,110,111,112,113,114,115,116,117,120,121,122,123,124,125,126,127,130,131,132,133,134,135,136,137},(__m512i)(__v32hi){200,201,202,203,204,205,206,207,210,211,212,213,214,215,216,217,220,221,222,223,224,225,226,227,230,231,232,233,234,235,236,237}),1,200,3,201,5,202,7,203,9,210,11,211,13,212,15,213,17,220,19,221,21,222,23,223,25,230,27,231,132,232,133,233)); __m512i test_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_unpacklo_epi16 @@ -2112,6 +2170,7 @@ __m512i test_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_unpacklo_epi16(__U, __A, __B); } +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_unpacklo_epi16((__mmask32)0xFAAAAAAAu,(__m512i)(__v32hi){100,101,102,103,104,105,106,107,110,111,112,113,114,115,116,117,120,121,122,123,124,125,126,127,130,131,132,133,134,135,136,137},(__m512i)(__v32hi){200,201,202,203,204,205,206,207,210,211,212,213,214,215,216,217,220,221,222,223,224,225,226,227,230,231,232,233,234,235,236,237}),0,200,0,201,0,202,0,203,0,210,0,211,0,212,0,213,0,220,0,221,0,222,0,223,0,230,0,231,132,232,133,233)); __m512i test_mm512_cvtepi8_epi16(__m256i __A) { // CHECK-LABEL: test_mm512_cvtepi8_epi16 @@ -2499,24 +2558,28 @@ __m512i test_mm512_mask_mov_epi16(__m512i __W, __mmask32 __U, __m512i __A) { // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_mov_epi16(__W, __U, __A); } +TEST_CONSTEXPR(match_v32hi(_mm512_mask_mov_epi16((__m512i)(__v32hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31},(__mmask32)0xAAAAAAAA,(__m512i)(__v32hi){-0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31}),0,-1,2,-3,4,-5,6,-7,8,-9,10,-11,12,-13,14,-15,16,-17,18,-19,20,-21,22,-23,24,-25,26,-27,28,-29,30,-31)); __m512i test_mm512_maskz_mov_epi16(__mmask32 __U, __m512i __A) { // CHECK-LABEL: test_mm512_maskz_mov_epi16 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_mov_epi16(__U, __A); } +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_mov_epi16((__mmask32)0xAAAAAAAA,(__m512i)(__v32hi){-0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31}),0,-1,0,-3,0,-5,0,-7,0,-9,0,-11,0,-13,0,-15,0,-17,0,-19,0,-21,0,-23,0,-25,0,-27,0,-29,0,-31)); __m512i test_mm512_mask_mov_epi8(__m512i __W, __mmask64 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_mov_epi8 // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_mov_epi8(__W, __U, __A); } +TEST_CONSTEXPR(match_v64qi(_mm512_mask_mov_epi8((__m512i)(__v64qs){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){-0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63}),0,-1,2,-3,4,-5,6,-7,8,-9,10,-11,12,-13,14,-15,16,-17,18,-19,20,-21,22,-23,24,-25,26,-27,28,-29,30,-31,32,-33,34,-35,36,-37,38,-39,40,-41,42,-43,44,-45,46,-47,48,-49,50,-51,52,-53,54,-55,56,-57,58,-59,60,-61,62,-63)); __m512i test_mm512_maskz_mov_epi8(__mmask64 __U, __m512i __A) { // CHECK-LABEL: test_mm512_maskz_mov_epi8 // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_mov_epi8(__U, __A); } +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_mov_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m512i)(__v64qs){-0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31,-32,-33,-34,-35,-36,-37,-38,-39,-40,-41,-42,-43,-44,-45,-46,-47,-48,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63}),0,-1,0,-3,0,-5,0,-7,0,-9,0,-11,0,-13,0,-15,0,-17,0,-19,0,-21,0,-23,0,-25,0,-27,0,-29,0,-31,0,-33,0,-35,0,-37,0,-39,0,-41,0,-43,0,-45,0,-47,0,-49,0,-51,0,-53,0,-55,0,-57,0,-59,0,-61,0,-63)); __m512i test_mm512_mask_set1_epi8(__m512i __O, __mmask64 __M, char __A) { // CHECK-LABEL: test_mm512_mask_set1_epi8 @@ -2585,6 +2648,7 @@ __m512i test_mm512_mask_set1_epi8(__m512i __O, __mmask64 __M, char __A) { // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_set1_epi8(__O, __M, __A); } +TEST_CONSTEXPR(match_v64qi(_mm512_mask_set1_epi8((__m512i)(__v64qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask64)0xAAAAAAAAAAAAAAAA,(char)42),1,42,3,42,5,42,7,42,9,42,11,42,13,42,15,42,17,42,19,42,21,42,23,42,25,42,27,42,29,42,31,42,33,42,35,42,37,42,39,42,41,42,43,42,45,42,47,42,49,42,51,42,53,42,55,42,57,42,59,42,61,42,63,42)); __m512i test_mm512_maskz_set1_epi8(__mmask64 __M, char __A) { // CHECK-LABEL: test_mm512_maskz_set1_epi8 @@ -2655,6 +2719,7 @@ __m512i test_mm512_maskz_set1_epi8(__mmask64 __M, char __A) { // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_set1_epi8(__M, __A); } +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_set1_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(char)42),0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42)); __mmask64 test_mm512_kunpackd(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) { // CHECK-LABEL: test_mm512_kunpackd @@ -2830,6 +2895,7 @@ __m512i test_mm512_mask_broadcastb_epi8(__m512i __O, __mmask64 __M, __m128i __A) // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_broadcastb_epi8(__O, __M, __A); } +TEST_CONSTEXPR(match_v64qi(_mm512_mask_broadcastb_epi8((__m512i)(__v64qs){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63},(__mmask64)0xAAAAAAAAAAAAAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,2,-120,4,-120,6,-120,8,-120,10,-120,12,-120,14,-120,16,-120,18,-120,20,-120,22,-120,24,-120,26,-120,28,-120,30,-120,32,-120,34,-120,36,-120,38,-120,40,-120,42,-120,44,-120,46,-120,48,-120,50,-120,52,-120,54,-120,56,-120,58,-120,60,-120,62,-120)); __m512i test_mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) { // CHECK-LABEL: test_mm512_maskz_broadcastb_epi8 @@ -2837,6 +2903,7 @@ __m512i test_mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) { // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_broadcastb_epi8(__M, __A); } +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_broadcastb_epi8((__mmask64)0xAAAAAAAAAAAAAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120)); __m512i test_mm512_broadcastw_epi16(__m128i __A) { // CHECK-LABEL: test_mm512_broadcastw_epi16 @@ -2878,6 +2945,7 @@ __m512i test_mm512_mask_broadcastw_epi16(__m512i __O, __mmask32 __M, __m128i __A // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_broadcastw_epi16(__O, __M, __A); } +TEST_CONSTEXPR(match_v32hi(_mm512_mask_broadcastw_epi16((__m512i)(__v32hi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31},(__mmask32)0xAAAAAAAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,2,-120,4,-120,6,-120,8,-120,10,-120,12,-120,14,-120,16,-120,18,-120,20,-120,22,-120,24,-120,26,-120,28,-120,30,-120)); __m512i test_mm512_maskz_broadcastw_epi16(__mmask32 __M, __m128i __A) { // CHECK-LABEL: test_mm512_maskz_broadcastw_epi16 @@ -2885,6 +2953,7 @@ __m512i test_mm512_maskz_broadcastw_epi16(__mmask32 __M, __m128i __A) { // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_broadcastw_epi16(__M, __A); } +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_broadcastw_epi16((__mmask32)0xAAAAAAAAu,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120)); __m512i test_mm512_mask_set1_epi16(__m512i __O, __mmask32 __M, short __A) { // CHECK-LABEL: test_mm512_mask_set1_epi16 @@ -2923,6 +2992,7 @@ __m512i test_mm512_mask_set1_epi16(__m512i __O, __mmask32 __M, short __A) { // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_set1_epi16(__O, __M, __A); } +TEST_CONSTEXPR(match_v32hi(_mm512_mask_set1_epi16((__m512i)(__v32hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32},(__mmask32)0xAAAAAAAA,-1),1,-1,3,-1,5,-1,7,-1,9,-1,11,-1,13,-1,15,-1,17,-1,19,-1,21,-1,23,-1,25,-1,27,-1,29,-1,31,-1)); __m512i test_mm512_maskz_set1_epi16(__mmask32 __M, short __A) { // CHECK-LABEL: test_mm512_maskz_set1_epi16 @@ -2961,6 +3031,8 @@ __m512i test_mm512_maskz_set1_epi16(__mmask32 __M, short __A) { // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_set1_epi16(__M, __A); } +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_set1_epi16((__mmask32)0xAAAAAAAA,42),0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42)); + __m512i test_mm512_permutexvar_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_permutexvar_epi16 // CHECK: @llvm.x86.avx512.permvar.hi.512 diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c index 172a3cb219c8a..28e6afbc24564 100644 --- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c @@ -941,56 +941,28 @@ __m128i test_mm_mask_blend_epi8(__mmask16 __U, __m128i __A, __m128i __W) { // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_blend_epi8(__U,__A,__W); } -TEST_CONSTEXPR(match_v16qi( - _mm_mask_blend_epi8( - (__mmask16)0x0001, - (__m128i)(__v16qi){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, - (__m128i)(__v16qi){ 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25 } - ), - 10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 -)); +TEST_CONSTEXPR(match_v16qi(_mm_mask_blend_epi8((__mmask16)0x0001,(__m128i)(__v16qi){2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2},(__m128i)(__v16qi){10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}),10,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2)); __m256i test_mm256_mask_blend_epi8(__mmask32 __U, __m256i __A, __m256i __W) { // CHECK-LABEL: test_mm256_mask_blend_epi8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_blend_epi8(__U,__A,__W); } -TEST_CONSTEXPR(match_v32qi( - _mm256_mask_blend_epi8( - (__mmask32) 0x00000001, - (__m256i)(__v32qi) {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, - (__m256i)(__v32qi){ 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25} - ), - 10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 -)); +TEST_CONSTEXPR(match_v32qi(_mm256_mask_blend_epi8((__mmask32)0x00000001,(__m256i)(__v32qi){2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2},(__m256i)(__v32qi){10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}),10,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2)); __m128i test_mm_mask_blend_epi16(__mmask8 __U, __m128i __A, __m128i __W) { // CHECK-LABEL: test_mm_mask_blend_epi16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_blend_epi16(__U,__A,__W); } -TEST_CONSTEXPR(match_v8hi( - _mm_mask_blend_epi16( - (__mmask8)0x01, - (__m128i)(__v8hi){2, 2, 2, 2, 2, 2, 2, 2}, - (__m128i)(__v8hi){ 10,11,12,13,14,15,16,17 } - ), - 10, 2, 2, 2, 2, 2, 2, 2 -)); +TEST_CONSTEXPR(match_v8hi(_mm_mask_blend_epi16((__mmask8)0x01,(__m128i)(__v8hi){2,2,2,2,2,2,2,2},(__m128i)(__v8hi){10,11,12,13,14,15,16,17}),10,2,2,2,2,2,2,2)); __m256i test_mm256_mask_blend_epi16(__mmask16 __U, __m256i __A, __m256i __W) { // CHECK-LABEL: test_mm256_mask_blend_epi16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_blend_epi16(__U,__A,__W); } -TEST_CONSTEXPR(match_v16hi( - _mm256_mask_blend_epi16( - (__mmask16)0x0001, - (__m256i)(__v16hi){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, - (__m256i)(__v16hi){ 10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25 } - ), - 10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 -)); +TEST_CONSTEXPR(match_v16hi(_mm256_mask_blend_epi16((__mmask16)0x0001,(__m256i)(__v16hi){2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2},(__m256i)(__v16hi){10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}),10,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2)); __m128i test_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_abs_epi8 @@ -1078,48 +1050,63 @@ __m128i test_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) { // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_packs_epi32(__M,__A,__B); } +TEST_CONSTEXPR(match_v8hi(_mm_maskz_packs_epi32((__mmask8)0xAA,(__m128i)(__v4si){40000,-50000,65535,-65536},(__m128i)(__v4si){0,50000,40000,-40000}),0,-32768,0,-32768,0,32767,0,-32768)); + __m128i test_mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_packs_epi32 // CHECK: @llvm.x86.sse2.packssdw // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_packs_epi32(__W,__M,__A,__B); } +TEST_CONSTEXPR(match_v8hi(_mm_mask_packs_epi32((__m128i)(__v8hi){1,2,3,4,29,30,31,32},(__mmask8)0xAA,(__m128i)(__v4si){40000,-50000,65535,-65536},(__m128i)(__v4si){0,50000,40000,-40000}),1,-32768,3,-32768,29,32767,31,-32768)); + __m256i test_mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_packs_epi32 // CHECK: @llvm.x86.avx2.packssdw // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_packs_epi32(__M,__A,__B); } +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_packs_epi32((__mmask32)0xAAAA,(__m256i)(__v8si){40000,-50000,32767,-32768,32768,-32769,65535,-65536},(__m256i)(__v8si){0,1,-1,65536,22222,-22222,40000,-40000}),0,-32768,0,-32768,0,1,0,32767,0,-32768,0,-32768,0,-22222,0,-32768)); + __m256i test_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_packs_epi32 // CHECK: @llvm.x86.avx2.packssdw // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_packs_epi32(__W,__M,__A,__B); } +TEST_CONSTEXPR(match_v16hi(_mm256_mask_packs_epi32((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,(__m256i)(__v8si){40000,-50000,32767,-32768,32768,-32769,65535,-65536},(__m256i)(__v8si){0,1,-1,65536,22222,-22222,40000,-40000}),1,-32768,3,-32768,5,1,7,32767,25,-32768,27,-32768,29,-22222,31,-32768)); + __m128i test_mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_packs_epi16 // CHECK: @llvm.x86.sse2.packsswb // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_packs_epi16(__M,__A,__B); } +TEST_CONSTEXPR(match_v16qi(_mm_maskz_packs_epi16((__mmask16)0xAAAA,(__m128i)(__v8hi){130,-200,127,-128,255,-255,127,-128},(__m128i)(__v8hi){0,1,-1,255,-128,90,-90,-32768}),0,-128,0,-128,0,-128,0,-128,0,1,0,127,0,90,0,-128)); + __m128i test_mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_packs_epi16 // CHECK: @llvm.x86.sse2.packsswb // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_packs_epi16(__W,__M,__A,__B); } +TEST_CONSTEXPR(match_v16qi(_mm_mask_packs_epi16((__m128i)(__v16qi){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v8hi){130,-200,127,-128,255,-255,127,-128},(__m128i)(__v8hi){0,1,-1,255,-128,90,-90,-32768}),1,-128,3,-128,5,-128,7,-128,57,1,59,127,61,90,63,-128)); + __m256i test_mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_packs_epi16 // CHECK: @llvm.x86.avx2.packsswb // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_packs_epi16(__M,__A,__B); } +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_packs_epi16((__mmask32)0xAAAAAAAA,(__m256i)(__v16hi){130,-200,127,-128,300,-1000,42,-42,500,-500,7,-7,255,-255,127,-128},(__m256i)(__v16hi){0,1,-1,255,-129,128,20000,-32768,0,1,-1,127,-128,90,-90,-32768}),0,-128,0,-128,0,-128,0,-42,0,1,0,127,0,127,0,-128,0,-128,0,-7,0,-128,0,-128,0,1,0,127,0,90,0,-128)); + __m256i test_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_packs_epi16 // CHECK: @llvm.x86.avx2.packsswb // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_packs_epi16(__W,__M,__A,__B); } +TEST_CONSTEXPR(match_v32qi(_mm256_mask_packs_epi16((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v16hi){130,-200,127,-128,300,-1000,42,-42,500,-500,7,-7,255,-255,127,-128},(__m256i)(__v16hi){0,1,-1,255,-129,128,20000,-32768,0,1,-1,127,-128,90,-90,-32768}),1,-128,3,-128,5,-128,7,-42,9,1,11,127,13,127,15,-128,49,-128,51,-7,53,-128,55,-128,57,1,59,127,61,90,63,-128)); __m128i test_mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_packus_epi32 @@ -1127,6 +1114,7 @@ __m128i test_mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_packus_epi32(__W,__M,__A,__B); } +TEST_CONSTEXPR(match_v8hu(_mm_mask_packus_epi32((__m128i)(__v8hu){1,2,3,4,5,6,7,8},(__mmask8)0xAA,(__m128i)(__v4si){40000,-50000,32767,-32768},(__m128i)(__v4si){0,1,-1,65536}),1,0,3,0,5,1,7,65535)); __m128i test_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_packus_epi32 @@ -1134,6 +1122,7 @@ __m128i test_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) { // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_packus_epi32(__M,__A,__B); } +TEST_CONSTEXPR(match_v8hu(_mm_maskz_packus_epi32((__mmask8)0xAA,(__m128i)(__v4si){40000,-50000,32767,-32768},(__m128i)(__v4si){0,1,-1,65536}),0,0,0,0,0,1,0,65535)); __m256i test_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_packus_epi32 @@ -1141,6 +1130,7 @@ __m256i test_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) { // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_packus_epi32(__M,__A,__B); } +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_packus_epi32((__mmask16)0xAAAA,(__m256i)(__v8si){40000,-50000,32767,-32768,32768,-32769,22222,-22222},(__m256i)(__v8si){0,1,-1,65536,40000,-40000,65535,0}),0,0,0,0,0,1,0,-1,0,0,0,0,0,0,0,0)); __m256i test_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_packus_epi32 @@ -1148,6 +1138,7 @@ __m256i test_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __ // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_packus_epi32(__W,__M,__A,__B); } +TEST_CONSTEXPR(match_v16hi(_mm256_mask_packus_epi32((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,(__m256i)(__v8si){40000,-50000,32767,-32768,32768,-32769,22222,-22222},(__m256i)(__v8si){0,1,-1,65536,40000,-40000,65535,0}),1,0,3,0,5,1,7,-1,25,0,27,0,29,0,31,0)); __m128i test_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_packus_epi16 @@ -1155,6 +1146,7 @@ __m128i test_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) { // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_packus_epi16(__M,__A,__B); } +TEST_CONSTEXPR(match_v16qu(_mm_maskz_packus_epi16((__mmask16)0xAAAA,(__m128i)(__v8hi){-1,0,1,127,128,255,256,-200},(__m128i)(__v8hi){0,1,-1,255,-129,128,20000,-32768}),0,0,0,127,0,255,0,0,0,1,0,255,0,128,0,0)); __m128i test_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_packus_epi16 @@ -1162,6 +1154,7 @@ __m128i test_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m12 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_packus_epi16(__W,__M,__A,__B); } +TEST_CONSTEXPR(match_v16qu(_mm_mask_packus_epi16((__m128i)(__v16qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},(__mmask16)0xAAAA,(__m128i)(__v8hi){-1,0,1,127,128,255,256,-200},(__m128i)(__v8hi){0,1,-1,255,-129,128,20000,-32768}),1,0,3,127,5,255,7,0,9,1,11,255,13,128,15,0)); __m256i test_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_packus_epi16 @@ -1169,6 +1162,7 @@ __m256i test_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) { // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_packus_epi16(__M,__A,__B); } +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_packus_epi16((__mmask32)0xAAAAAAAA,(__m256i)(__v16hi){-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129},(__m256i)(__v16hi){0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90}),0,0,0,127,0,-1,0,0,0,1,0,-1,0,-128,0,0,0,42,0,-1,0,0,0,0,0,0,0,0,0,0,0,0)); __m256i test_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_packus_epi16 @@ -1176,6 +1170,7 @@ __m256i test_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __ // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_packus_epi16(__W,__M,__A,__B); } +TEST_CONSTEXPR(match_v32qi(_mm256_mask_packus_epi16((__m256i)(__v32qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v16hi){-1,0,1,127,128,255,256,-200,300,42,-42,500,20000,-32768,129,-129},(__m256i)(__v16hi){0,1,-1,255,-129,128,20000,-32768,32767,-32767,127,-128,30000,-30000,90,-90}),1,0,3,127,5,-1,7,0,9,1,11,-1,13,-128,15,0,49,42,51,-1,53,0,55,0,57,0,59,0,61,0,63,0)); __m128i test_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_adds_epi8 @@ -1183,48 +1178,64 @@ __m128i test_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_adds_epi8(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v16qi(_mm_mask_adds_epi8((__m128i)(__v16qs){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v16qs){0,+1,-2,+3,-4,+5,-6,+7,-100,-50,+100,-20,-80,+120,-120,-20},(__m128i)(__v16qs){0,+1,-2,+3,-4,+5,-6,+7,+50,+80,-50,+110,+60,120,+20,-120}),1,+2,3,+6,5,+10,7,+14,57,+30,59,+90,61,+127,63,-128)); + __m128i test_mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_adds_epi8 // CHECK: @llvm.sadd.sat.v16i8 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_adds_epi8(__U,__A,__B); } +TEST_CONSTEXPR(match_v16qi(_mm_maskz_adds_epi8((__mmask16)0xAAAA,(__m128i)(__v16qs){0,+1,-2,+3,-4,+5,-6,+7,-100,-50,+100,-20,-80,+120,-120,-20},(__m128i)(__v16qs){0,+1,-2,+3,-4,+5,-6,+7,+50,+80,-50,+110,+60,120,+20,-120}),0,+2,0,+6,0,+10,0,+14,0,+30,0,+90,0,+127,0,-128)); + __m256i test_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_adds_epi8 // CHECK: @llvm.sadd.sat.v32i8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_adds_epi8(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v32qi(_mm256_mask_adds_epi8((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v32qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,+100,+50,-100,+20,+80,-50,+120,-20,-100,-50,+100,-20,-80,+50,-120,+20},(__m256i)(__v32qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,+50,+80,-50,+110,+60,-30,+20,-10,+50,+80,-50,+110,+60,-30,+20,-10}),1,+2,3,+6,5,+10,7,+14,9,+18,11,+22,13,+26,15,+30,49,+127,51,+127,53,-80,+55,-30,57,+30,59,+90,61,+20,63,+10)); + __m256i test_mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_adds_epi8 // CHECK: @llvm.sadd.sat.v32i8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_adds_epi8(__U,__A,__B); } +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_adds_epi8((__mmask32)0xAAAAAAAA,(__m256i)(__v32qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,+100,+50,-100,+20,+80,-50,+120,-20,-100,-50,+100,-20,-80,+50,-120,+20},(__m256i)(__v32qs){0,+1,-2,+3,-4,+5,-6,+7,-8,+9,-10,+11,-12,+13,-14,+15,+50,+80,-50,+110,+60,-30,+20,-10,+50,+80,-50,+110,+60,-30,+20,-10}),0,+2,0,+6,0,+10,0,+14,0,+18,0,+22,0,+26,0,+30,0,+127,0,+127,0,-80,0,-30,0,+30,0,+90,0,+20,0,+10)); + __m128i test_mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_adds_epi16 // CHECK: @llvm.sadd.sat.v8i16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_adds_epi16(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v8hi(_mm_mask_adds_epi16((__m128i)(__v8hi){9,10,11,12,13,14,15,16,},(__mmask8)0xAA,(__m128i)(__v8hi){-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m128i)(__v8hi){-24,+25,-26,+27,+800,-800,-800,+800}),9,+50,11,+54,13,-32768,15,+32767)); + __m128i test_mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_adds_epi16 // CHECK: @llvm.sadd.sat.v8i16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_adds_epi16(__U,__A,__B); } +TEST_CONSTEXPR(match_v8hi(_mm_maskz_adds_epi16((__mmask8)0xAA,(__m128i)(__v8hi){-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m128i)(__v8hi){-24,+25,-26,+27,+800,-800,-800,+800}),0,+50,0,+54,0,-32768,0,+32767)); + __m256i test_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_adds_epi16 // CHECK: @llvm.sadd.sat.v16i16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_adds_epi16(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v16hi(_mm256_mask_adds_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,},(__mmask16)0xAAAA,(__m256i)(__v16hi){0,+1,-2,+3,-4,+5,-6,+7,-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m256i)(__v16hi){0,+1,-2,+3,-4,+5,-6,+7,-24,+25,-26,+27,+800,-800,-800,+800}),1,+2,3,+6,5,+10,7,+14,9,+50,11,+54,13,-32768,15,+32767)); + __m256i test_mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_adds_epi16 // CHECK: @llvm.sadd.sat.v16i16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_adds_epi16(__U,__A,__B); } +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_adds_epi16((__mmask16)0xAAAA,(__m256i)(__v16hi){0,+1,-2,+3,-4,+5,-6,+7,-24,+25,-26,+27,+32000,-32000,+32000,+32000},(__m256i)(__v16hi){0,+1,-2,+3,-4,+5,-6,+7,-24,+25,-26,+27,+800,-800,-800,+800}),0,+2,0,+6,0,+10,0,+14,0,+50,0,+54,0,-32768,0,+32767)); + __m128i test_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_adds_epu8 // CHECK-NOT: @llvm.x86.sse2.paddus.b @@ -1232,6 +1243,8 @@ __m128i test_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_adds_epu8(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v16qu(_mm_mask_adds_epu8((__m128i)(__v16qu){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v16qu){0,0,0,0,0,0,0,0,+255,+255,+255,+255,+255,+255,+255,+255},(__m128i)(__v16qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),1,+63,3,+127,5,+191,7,+255,57,+255,59,+255,61,+255,63,+255)); + __m128i test_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_adds_epu8 // CHECK-NOT: @llvm.x86.sse2.paddus.b @@ -1239,6 +1252,8 @@ __m128i test_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_adds_epu8(__U,__A,__B); } +TEST_CONSTEXPR(match_v16qu(_mm_maskz_adds_epu8((__mmask16)0xAAAA,(__m128i)(__v16qu){0,0,0,0,0,0,0,0,+255,+255,+255,+255,+255,+255,+255,+255},(__m128i)(__v16qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),0,+63,0,+127,0,+191,0,+255,0,+255,0,+255,0,+255,0,+255)); + __m256i test_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_adds_epu8 // CHECK-NOT: @llvm.x86.avx2.paddus.b @@ -1246,6 +1261,8 @@ __m256i test_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m25 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_adds_epu8(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v32qu(_mm256_mask_adds_epu8((__m256i)(__v32qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v32qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m256i)(__v32qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),1,+63,3,+127,5,+191,7,+255,9,+126,11,+190,13,+254,15,+255,49,+255,51,+255,53,+255,55,+255,57,+255,59,+255,61,+255,63,+255)); + __m256i test_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_adds_epu8 // CHECK-NOT: @llvm.x86.avx2.paddus.b @@ -1253,6 +1270,8 @@ __m256i test_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_adds_epu8(__U,__A,__B); } +TEST_CONSTEXPR(match_v32qu(_mm256_maskz_adds_epu8((__mmask32)0xAAAAAAAA,(__m256i)(__v32qu){0,0,0,0,0,0,0,0,+63,+63,+63,+63,+63,+63,+63,+63,+192,+192,+192,+192,+192,+192,+192,+192,+255,+255,+255,+255,+255,+255,+255,+255},(__m256i)(__v32qu){0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255,0,+63,+64,+127,+128,+191,+192,+255}),0,+63,0,+127,0,+191,0,+255,0,+126,0,+190,0,+254,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255,0,+255)); + __m128i test_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_adds_epu16 // CHECK-NOT: @llvm.x86.sse2.paddus.w @@ -1260,6 +1279,8 @@ __m128i test_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_adds_epu16(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v8hu(_mm_mask_adds_epu16((__m128i)(__v8hu){25,26,27,28,29,30,31,32},(__mmask8)0xAA,(__m128i)(__v8hu){+16384,+16384,+16384,+16384,+49152,+49152,+49152,+49152},(__m128i)(__v8hu){0,+16384,+32767,+32768,+32767,+32768,+49152,+65535}),25,+32768,27,+49152,29,+65535,31,+65535)); + __m128i test_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_adds_epu16 // CHECK-NOT: @llvm.x86.sse2.paddus.w @@ -1267,6 +1288,8 @@ __m128i test_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_adds_epu16(__U,__A,__B); } +TEST_CONSTEXPR(match_v8hu(_mm_maskz_adds_epu16((__mmask8)0xAA,(__m128i)(__v8hu){+16384,+16384,+16384,+16384,+49152,+49152,+49152,+49152},(__m128i)(__v8hu){0,+16384,+32767,+32768,+32767,+32768,+49152,+65535}),0,+32768,0,+49152,0,+65535,0,+65535)); + __m256i test_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_adds_epu16 // CHECK-NOT: @llvm.x86.avx2.paddus.w @@ -1274,6 +1297,8 @@ __m256i test_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m2 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_adds_epu16(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v16hu(_mm256_mask_adds_epu16((__m256i)(__v16hu){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,(__m256i)(__v16hu){0,0,0,0,+16384,+16384,+16384,+16384,+49152,+49152,+49152,+49152,+65535,+65535,+65535,+65535},(__m256i)(__v16hu){0,+32767,+32768,+65535,0,+16384,+32767,+32768,+32767,+32768,+49152,+65535,0,+32767,+32768,+65535}),1,+32767,3,+65535,5,+32768,7,+49152,25,+65535,27,+65535,29,+65535,31,+65535)); + __m256i test_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_adds_epu16 // CHECK-NOT: @llvm.x86.avx2.paddus.w @@ -1281,6 +1306,8 @@ __m256i test_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) { // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_adds_epu16(__U,__A,__B); } +TEST_CONSTEXPR(match_v16hu(_mm256_maskz_adds_epu16((__mmask16)0xAAAA,(__m256i)(__v16hu){0,0,0,0,+16384,+16384,+16384,+16384,+49152,+49152,+49152,+49152,+65535,+65535,+65535,+65535},(__m256i)(__v16hu){0,+32767,+32768,+65535,0,+16384,+32767,+32768,+32767,+32768,+49152,+65535,0,+32767,+32768,+65535}),0,+32767,0,+65535,0,+32768,0,+49152,0,+65535,0,+65535,0,+65535,0,+65535)); + __m128i test_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_avg_epu8 // CHECK: @llvm.x86.sse2.pavg.b @@ -1740,48 +1767,64 @@ __m128i test_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_subs_epi8(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v16qi(_mm_mask_subs_epi8((__m128i)(__v16qs){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v16qs){1,-100,3,4,5,-6,7,100,57,-100,59,60,61,-62,63,100},(__m128i)(__v16qs){1,100,3,4,5,6,7,-100,57,100,59,60,61,62,63,-100}),1,-128,3,0,5,-12,7,127,57,-128,59,0,61,-124,63,127)); + __m128i test_mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_subs_epi8 // CHECK: @llvm.ssub.sat.v16i8 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_subs_epi8(__U,__A,__B); } +TEST_CONSTEXPR(match_v16qi(_mm_maskz_subs_epi8((__mmask16)0xAAAA,(__m128i)(__v16qs){1,-100,3,4,5,-6,7,100,57,-100,59,60,61,-62,63,100},(__m128i)(__v16qs){1,100,3,4,5,6,7,-100,57,100,59,60,61,62,63,-100}),0,-128,0,0,0,-12,0,127,0,-128,0,0,0,-124,0,127)); + __m256i test_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_subs_epi8 // CHECK: @llvm.ssub.sat.v32i8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_subs_epi8(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v32qi(_mm256_mask_subs_epi8((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v32qs){1,-100,3,4,5,-6,7,100,9,-100,11,12,13,-14,15,100,49,-100,51,52,53,-54,55,100,57,-100,59,60,61,-62,63,100},(__m256i)(__v32qs){1,100,3,4,5,6,7,-100,9,100,11,12,13,14,15,-100,49,100,51,52,53,54,55,-100,57,100,59,60,61,62,63,-100}),1,-128,3,0,5,-12,7,127,9,-128,11,0,13,-28,15,127,49,-128,51,0,53,-108,55,127,57,-128,59,0,61,-124,63,127)); + __m256i test_mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_subs_epi8 // CHECK: @llvm.ssub.sat.v32i8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_subs_epi8(__U,__A,__B); } +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_subs_epi8((__mmask32)0xAAAAAAAA,(__m256i)(__v32qs){1,-100,3,4,5,-6,7,100,9,-100,11,12,13,-14,15,100,49,-100,51,52,53,-54,55,100,57,-100,59,60,61,-62,63,100},(__m256i)(__v32qs){1,100,3,4,5,6,7,-100,9,100,11,12,13,14,15,-100,49,100,51,52,53,54,55,-100,57,100,59,60,61,62,63,-100}),0,-128,0,0,0,-12,0,127,0,-128,0,0,0,-28,0,127,0,-128,0,0,0,-108,0,127,0,-128,0,0,0,-124,0,127)); + __m128i test_mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_subs_epi16 // CHECK: @llvm.ssub.sat.v8i16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_subs_epi16(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v8hi(_mm_mask_subs_epi16((__m128i)(__v8hi){1,2,3,4,29,30,31,32},(__mmask8)0xAA,(__m128i)(__v8hi){1,-30000,3,30000,29,-30,31,32},(__m128i)(__v8hi){1,30000,3,-30000,29,30,31,-32}),1,-32768,3,32767,29,-60,31,64)); + __m128i test_mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_subs_epi16 // CHECK: @llvm.ssub.sat.v8i16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_subs_epi16(__U,__A,__B); } +TEST_CONSTEXPR(match_v8hi(_mm_maskz_subs_epi16((__mmask8)0xAA,(__m128i)(__v8hi){1,-30000,3,30000,29,-30,31,32},(__m128i)(__v8hi){1,30000,3,-30000,29,30,31,-32}),0,-32768,0,32767,0,-60,0,64)); + __m256i test_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_subs_epi16 // CHECK: @llvm.ssub.sat.v16i16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_subs_epi16(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v16hi(_mm256_mask_subs_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,(__m256i)(__v16hi){1,-30000,3,30000,5,-6,7,8,25,-30000,27,30000,29,-30,31,32},(__m256i)(__v16hi){1,30000,3,-30000,5,6,7,-8,25,30000,27,-30000,29,30,31,-32}),1,-32768,3,32767,5,-12,7,16,25,-32768,27,32767,29,-60,31,64)); + __m256i test_mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_subs_epi16 // CHECK: @llvm.ssub.sat.v16i16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_subs_epi16(__U,__A,__B); } +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_subs_epi16((__mmask16)0xAAAA,(__m256i)(__v16hi){1,-30000,3,30000,5,-6,7,8,25,-30000,27,30000,29,-30,31,32},(__m256i)(__v16hi){1,30000,3,-30000,5,6,7,-8,25,30000,27,-30000,29,30,31,-32}),0,-32768,0,32767,0,-12,0,16,0,-32768,0,32767,0,-60,0,64)); + __m128i test_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_subs_epu8 // CHECK-NOT: @llvm.x86.sse2.psubus.b @@ -1789,6 +1832,8 @@ __m128i test_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_subs_epu8(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v16qu(_mm_mask_subs_epu8((__m128i)(__v16qu){1,2,3,4,5,6,7,8,57,58,59,60,61,62,63,64},(__mmask16)0xAAAA,(__m128i)(__v16qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m128i)(__v16qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),1,200,3,0,5,0,7,254,57,0,59,1,61,1,63,0)); + __m128i test_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_subs_epu8 // CHECK-NOT: @llvm.x86.sse2.psubus.b @@ -1796,6 +1841,8 @@ __m128i test_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_subs_epu8(__U,__A,__B); } +TEST_CONSTEXPR(match_v16qu(_mm_maskz_subs_epu8((__mmask16)0xAAAA,(__m128i)(__v16qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m128i)(__v16qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0)); + __m256i test_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_subs_epu8 // CHECK-NOT: @llvm.x86.avx2.psubus.b @@ -1803,6 +1850,8 @@ __m256i test_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m25 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_subs_epu8(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v32qu(_mm256_mask_subs_epu8((__m256i)(__v32qu){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(__m256i)(__v32qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m256i)(__v32qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),1,200,3,0,5,0,7,254,9,0,11,1,13,1,15,0,49,200,51,0,53,0,55,254,57,0,59,1,61,1,63,0)); + __m256i test_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_subs_epu8 // CHECK-NOT: @llvm.x86.avx2.psubus.b @@ -1810,6 +1859,8 @@ __m256i test_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_subs_epu8(__U,__A,__B); } +TEST_CONSTEXPR(match_v32qu(_mm256_maskz_subs_epu8((__mmask32)0xAAAAAAAA,(__m256i)(__v32qu){0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255,0,250,0,128,0,20,0,255,0,0,0,1,0,100,0,255},(__m256i)(__v32qu){0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255,0,50,0,128,0,30,0,1,0,1,0,0,0,99,0,255}),0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0,0,200,0,0,0,0,0,254,0,0,0,1,0,1,0,0)); + __m128i test_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_subs_epu16 // CHECK-NOT: @llvm.x86.sse2.psubus.w @@ -1817,6 +1868,8 @@ __m128i test_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_subs_epu16(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v8hu(_mm_mask_subs_epu16((__m128i)(__v8hu){101,102,103,104,129,130,131,132},(__mmask8)0xAAu,(__m128i)(__v8hu){0,65000,0,40000,0,1,0,50000},(__m128i)(__v8hu){0,5000,0,60000,0,0,0,25000}),101,60000,103,0,129,1,131,25000)); + __m128i test_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_subs_epu16 // CHECK-NOT: @llvm.x86.sse2.psubus.w @@ -1824,6 +1877,8 @@ __m128i test_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_subs_epu16(__U,__A,__B); } +TEST_CONSTEXPR(match_v8hu(_mm_maskz_subs_epu16((__mmask8)0xAAu,(__m128i)(__v8hu){0,65000,0,40000,0,1,0,50000},(__m128i)(__v8hu){0,5000,0,60000,0,0,0,25000}),0,60000,0,0,0,1,0,25000)); + __m256i test_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_subs_epu16 // CHECK-NOT: @llvm.x86.avx2.psubus.w @@ -1831,6 +1886,8 @@ __m256i test_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m2 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_subs_epu16(__W,__U,__A,__B); } +TEST_CONSTEXPR(match_v16hu(_mm256_mask_subs_epu16((__m256i)(__v16hu){101,102,103,104,105,106,107,108,125,126,127,128,129,130,131,132},(__mmask16)0xAAAAu,(__m256i)(__v16hu){0,65000,0,40000,0,100,0,65535,0,0,0,1000,0,1,0,50000},(__m256i)(__v16hu){0,5000,0,40000,0,200,0,1,0,1,0,65535,0,0,0,25000}),101,60000,103,0,105,0,107,65534,125,0,127,0,129,1,131,25000)); + __m256i test_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_subs_epu16 // CHECK-NOT: @llvm.x86.avx2.psubus.w @@ -1838,7 +1895,7 @@ __m256i test_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) { // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_subs_epu16(__U,__A,__B); } - +TEST_CONSTEXPR(match_v16hu(_mm256_maskz_subs_epu16((__mmask16)0xAAAAu,(__m256i)(__v16hu){0,65000,0,40000,0,100,10,65535,0,0,0,1000,0,1,10000,50000},(__m256i)(__v16hu){0,5000,0,40000,0,200,0,1,0,1,0,65535,0,0,0,25000}),0,60000,0,0,0,0,0,65534,0,0,0,0,0,1,0,25000)); __m128i test_mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) { // CHECK-LABEL: test_mm_mask2_permutex2var_epi16 @@ -2233,6 +2290,7 @@ __m128i test_mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m1 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_unpackhi_epi8(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_v16qi(_mm_mask_unpackhi_epi8((__m128i)(__v16qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},(__mmask16)0xFAAA,(__m128i)(__v16qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115},(__m128i)(__v16qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}),1,-9,3,-10,5,-11,7,-12,9,-13,11,-14,114,-15,115,-16)); __m128i test_mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_unpackhi_epi8 @@ -2240,6 +2298,7 @@ __m128i test_mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_unpackhi_epi8(__U, __A, __B); } +TEST_CONSTEXPR(match_v16qi(_mm_maskz_unpackhi_epi8((__mmask16)0xFAAA,(__m128i)(__v16qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115},(__m128i)(__v16qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}),0,-9,0,-10,0,-11,0,-12,0,-13,0,-14,114,-15,115,-16)); __m256i test_mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_unpackhi_epi8 @@ -2247,6 +2306,7 @@ __m256i test_mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, _ // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_unpackhi_epi8(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_v32qi(_mm256_mask_unpackhi_epi8((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xFAAAAAAA,(__m256i)(__v32qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89},(__m256i)(__v32qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64}),1,-9,3,-10,5,-11,7,-12,9,-13,11,-14,13,-15,15,-16,49,-57,51,-58,53,-59,55,-60,57,-61,59,-62,-90,-63,-89,-64)); __m256i test_mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_unpackhi_epi8 @@ -2254,6 +2314,7 @@ __m256i test_mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_unpackhi_epi8(__U, __A, __B); } +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_unpackhi_epi8((__mmask32)0xFAAAAAAA,(__m256i)(__v32qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89},(__m256i)(__v32qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-49,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64}),0,-9,0,-10,0,-11,0,-12,0,-13,0,-14,0,-15,0,-16,0,-57,0,-58,0,-59,0,-60,0,-61,0,-62,-90,-63,-89,-64)); __m128i test_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_unpackhi_epi16 @@ -2261,6 +2322,7 @@ __m128i test_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m1 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_unpackhi_epi16(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_v8hi(_mm_mask_unpackhi_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8},(__mmask8)0xFA,(__m128i)(__v8hi){100,101,102,103,104,105,106,107},(__m128i)(__v8hi){200,201,202,203,204,205,206,207}),1,204,3,205,106,206,107,207)); __m128i test_mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_unpackhi_epi16 @@ -2268,6 +2330,7 @@ __m128i test_mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_unpackhi_epi16(__U, __A, __B); } +TEST_CONSTEXPR(match_v8hi(_mm_maskz_unpackhi_epi16((__mmask8)0xFA,(__m128i)(__v8hi){100,101,102,103,104,105,106,107},(__m128i)(__v8hi){200,201,202,203,204,205,206,207}),0,204,0,205,106,206,107,207)); __m256i test_mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_unpackhi_epi16 @@ -2275,6 +2338,7 @@ __m256i test_mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_unpackhi_epi16(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_v16hi(_mm256_mask_unpackhi_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xFAAAu,(__m256i)(__v16hi){100,101,102,103,104,105,106,107,130,131,132,133,134,135,136,137},(__m256i)(__v16hi){200,201,202,203,204,205,206,207,230,231,232,233,234,235,236,237}),1,204,3,205,5,206,7,207,25,234,27,235,136,236,137,237)); __m256i test_mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_unpackhi_epi16 @@ -2282,6 +2346,7 @@ __m256i test_mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_unpackhi_epi16(__U, __A, __B); } +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_unpackhi_epi16((__mmask16)0xFAAAu,(__m256i)(__v16hi){100,101,102,103,104,105,106,107,130,131,132,133,134,135,136,137},(__m256i)(__v16hi){200,201,202,203,204,205,206,207,230,231,232,233,234,235,236,237}),0,204,0,205,0,206,0,207,0,234,0,235,136,236,137,237)); __m128i test_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_unpacklo_epi8 @@ -2289,6 +2354,7 @@ __m128i test_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m1 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_unpacklo_epi8(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_v16qi(_mm_mask_unpacklo_epi8((__m128i)(__v16qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},(__mmask16)0xFAAA,(__m128i)(__v16qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115},(__m128i)(__v16qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}),1,-1,3,-2,5,-3,7,-4,9,-5,11,-6,106,-7,107,-8)); __m128i test_mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_unpacklo_epi8 @@ -2296,6 +2362,7 @@ __m128i test_mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_unpacklo_epi8(__U, __A, __B); } +TEST_CONSTEXPR(match_v16qi(_mm_maskz_unpacklo_epi8((__mmask16)0xFAAA,(__m128i)(__v16qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115},(__m128i)(__v16qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}),0,-1,0,-2,0,-3,0,-4,0,-5,0,-6,106,-7,107,-8)); __m256i test_mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_unpacklo_epi8 @@ -2303,6 +2370,7 @@ __m256i test_mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, _ // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_unpacklo_epi8(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_v32qi(_mm256_mask_unpacklo_epi8((__m256i)(__v32qs){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xFAAAAAAA,(__m256i)(__v32qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64,-65},(__m256i)(__v32qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75}),1,-1,3,-2,5,-3,7,-4,9,-5,11,-6,13,-7,15,-8,49,60,51,61,53,62,55,63,57,64,59,65,-56,66,-57,67)); __m256i test_mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_unpacklo_epi8 @@ -2310,6 +2378,7 @@ __m256i test_mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_unpacklo_epi8(__U, __A, __B); } +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_unpacklo_epi8((__mmask32)0xFAAAAAAA,(__m256i)(__v32qs){100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,-50,-51,-52,-53,-54,-55,-56,-57,-58,-59,-60,-61,-62,-63,-64,-65},(__m256i)(__v32qs){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75}),0,-1,0,-2,0,-3,0,-4,0,-5,0,-6,0,-7,0,-8,0,60,0,61,0,62,0,63,0,64,0,65,-56,66,-57,67)); __m128i test_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_unpacklo_epi16 @@ -2317,6 +2386,7 @@ __m128i test_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m1 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_unpacklo_epi16(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_v8hi(_mm_mask_unpacklo_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8},(__mmask8)0xFAu,(__m128i)(__v8hi){100,101,102,103,104,105,106,107},(__m128i)(__v8hi){200,201,202,203,204,205,206,207}),1,200,3,201,102,202,103,203)); __m128i test_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_unpacklo_epi16 @@ -2324,6 +2394,7 @@ __m128i test_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_unpacklo_epi16(__U, __A, __B); } +TEST_CONSTEXPR(match_v8hi(_mm_maskz_unpacklo_epi16((__mmask8)0xFAu,(__m128i)(__v8hi){100,101,102,103,104,105,106,107},(__m128i)(__v8hi){200,201,202,203,204,205,206,207}),0,200,0,201,102,202,103,203)); __m256i test_mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_unpacklo_epi16 @@ -2331,6 +2402,7 @@ __m256i test_mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_unpacklo_epi16(__W, __U, __A, __B); } +TEST_CONSTEXPR(match_v16hi(_mm256_mask_unpacklo_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xFAAAu,(__m256i)(__v16hi){100,101,102,103,104,105,106,107,130,131,132,133,134,135,136,137},(__m256i)(__v16hi){200,201,202,203,204,205,206,207,230,231,232,233,234,235,236,237}),1,200,3,201,5,202,7,203,25,230,27,231,132,232,133,233)); __m256i test_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_unpacklo_epi16 @@ -2338,6 +2410,7 @@ __m256i test_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_unpacklo_epi16(__U, __A, __B); } +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_unpacklo_epi16((__mmask16)0xFAAAu,(__m256i)(__v16hi){100,101,102,103,104,105,106,107,130,131,132,133,134,135,136,137},(__m256i)(__v16hi){200,201,202,203,204,205,206,207,230,231,232,233,234,235,236,237}),0,200,0,201,0,202,0,203,0,230,0,231,132,232,133,233)); __m128i test_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_cvtepi8_epi16 @@ -2345,6 +2418,7 @@ __m128i test_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_cvtepi8_epi16(__W, __U, __A); } +TEST_CONSTEXPR(match_v8hi(_mm_mask_cvtepi8_epi16(_mm_set1_epi16(-777),(__mmask8)0xA5,(__m128i)(__v16qs){1,-2,3,-4,5,-6,7,-8,9,10,11,12,13,14,15,16}),1,-777,3,-777,-777,-6,-777,-8)); __m128i test_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_maskz_cvtepi8_epi16 @@ -2352,6 +2426,7 @@ __m128i test_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) { // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_cvtepi8_epi16(__U, __A); } +TEST_CONSTEXPR(match_v8hi(_mm_maskz_cvtepi8_epi16((__mmask8)0xA5,(__m128i)(__v16qs){1,-2,3,-4,5,-6,7,-8,9,10,11,12,13,14,15,16}),1,0,3,0,0,-6,0,-8)); __m256i test_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) { // CHECK-LABEL: test_mm256_mask_cvtepi8_epi16 @@ -2359,6 +2434,7 @@ __m256i test_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) { // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_cvtepi8_epi16(__W, __U, __A); } +TEST_CONSTEXPR(match_v16hi(_mm256_mask_cvtepi8_epi16(_mm256_set1_epi16(-777),/*1001110010100101=*/0x9ca5,(__m128i)(__v16qs){1,-2,3,-4,5,-6,7,-8,25,-26,27,-28,29,-30,31,-32}),1,-777,3,-777,-777,-6,-777,-8,-777,-777,27,-28,29,-777,-777,-32)); __m256i test_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) { // CHECK-LABEL: test_mm256_maskz_cvtepi8_epi16 @@ -2366,6 +2442,7 @@ __m256i test_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) { // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_cvtepi8_epi16(__U, __A); } +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_cvtepi8_epi16(/*1001110010100101=*/0x9ca5,(__m128i)(__v16qs){1,-2,3,-4,5,-6,7,-8,25,-26,27,-28,29,-30,31,-32}),1,0,3,0,0,-6,0,-8,0,0,27,-28,29,0,0,-32)); __m128i test_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_cvtepu8_epi16 @@ -2373,6 +2450,7 @@ __m128i test_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_cvtepu8_epi16(__W, __U, __A); } +TEST_CONSTEXPR(match_v8hi(_mm_mask_cvtepu8_epi16(_mm_set1_epi16(-777),(__mmask8)0xA5,(__m128i)(__v16qu){25,26,27,28,29,30,31,32,0,0,0,0,0,0,0,0}),25,-777,27,-777,-777,30,-777,32)); __m128i test_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_maskz_cvtepu8_epi16 @@ -2380,6 +2458,7 @@ __m128i test_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) { // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_cvtepu8_epi16(__U, __A); } +TEST_CONSTEXPR(match_v8hi(_mm_maskz_cvtepu8_epi16((__mmask8)0xA5,(__m128i)(__v16qu){25,26,27,28,29,30,31,32,0,0,0,0,0,0,0,0}),25,0,27,0,0,30,0,32)); __m256i test_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) { // CHECK-LABEL: test_mm256_mask_cvtepu8_epi16 @@ -2387,6 +2466,7 @@ __m256i test_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) { // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_cvtepu8_epi16(__W, __U, __A); } +TEST_CONSTEXPR(match_v16hi(_mm256_mask_cvtepu8_epi16(_mm256_set1_epi16(-777),/*1001110010100101=*/0x9ca5,(__m128i)(__v16qu){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32}),1,-777,3,-777,-777,6,-777,8,-777,-777,27,28,29,-777,-777,32)); __m256i test_mm256_maskz_cvtepu8_epi16(__mmask16 __U, __m128i __A) { // CHECK-LABEL: test_mm256_maskz_cvtepu8_epi16 @@ -2394,6 +2474,7 @@ __m256i test_mm256_maskz_cvtepu8_epi16(__mmask16 __U, __m128i __A) { // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_cvtepu8_epi16(__U, __A); } +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_cvtepu8_epi16(/*1001110010100101=*/0x9ca5,(__m128i)(__v16qu){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32}),1,0,3,0,0,6,0,8,0,0,27,28,29,0,0,32)); __m256i test_mm256_sllv_epi16(__m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_sllv_epi16 @@ -2468,6 +2549,7 @@ __m256i test_mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B) { // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_sll_epi16(__U, __A, __B); } +TEST_CONSTEXPR(match_v8hi(_mm_maskz_slli_epi16((__mmask8)0xAA, (__m128i)(__v8hi){0, 1, 2, 3, 4, 5, 6, 7}, 20), 0, 0, 0, 0, 0, 0, 0, 0)); __m128i test_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_slli_epi16 @@ -2475,6 +2557,7 @@ __m128i test_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_slli_epi16(__W, __U, __A, 5); } +TEST_CONSTEXPR(match_v8hi(_mm_mask_slli_epi16((__m128i)(__v8hi){100, 101, 102, 103, 104, 105, 106, 107}, (__mmask8)0xAA, (__m128i)(__v8hi){0, 1, 2, 3, 4, 5, 6, 7}, 20), 100, 0, 102, 0, 104, 0, 106, 0)); __m128i test_mm_mask_slli_epi16_2(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) { // CHECK-LABEL: test_mm_mask_slli_epi16_2 @@ -3152,6 +3235,7 @@ __m128i test_mm_mask_broadcastb_epi8(__m128i __O, __mmask16 __M, __m128i __A) { // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_broadcastb_epi8(__O, __M, __A); } +TEST_CONSTEXPR(match_v16qi(_mm_mask_broadcastb_epi8((__m128i)(__v16qs){0,1,2,3,4,5,6,7,56,57,58,59,60,61,62,63},(__mmask16)0xAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,2,-120,4,-120,6,-120,56,-120,58,-120,60,-120,62,-120)); __m128i test_mm_maskz_broadcastb_epi8(__mmask16 __M, __m128i __A) { // CHECK-LABEL: test_mm_maskz_broadcastb_epi8 @@ -3159,6 +3243,7 @@ __m128i test_mm_maskz_broadcastb_epi8(__mmask16 __M, __m128i __A) { // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_broadcastb_epi8(__M, __A); } +TEST_CONSTEXPR(match_v16qi(_mm_maskz_broadcastb_epi8((__mmask16)0xAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120)); __m256i test_mm256_mask_broadcastb_epi8(__m256i __O, __mmask32 __M, __m128i __A) { // CHECK-LABEL: test_mm256_mask_broadcastb_epi8 @@ -3166,6 +3251,7 @@ __m256i test_mm256_mask_broadcastb_epi8(__m256i __O, __mmask32 __M, __m128i __A) // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_broadcastb_epi8(__O, __M, __A); } +TEST_CONSTEXPR(match_v32qi(_mm256_mask_broadcastb_epi8((__m256i)(__v32qs){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63},(__mmask32)0xAAAAAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,2,-120,4,-120,6,-120,8,-120,10,-120,12,-120,14,-120,48,-120,50,-120,52,-120,54,-120,56,-120,58,-120,60,-120,62,-120)); __m256i test_mm256_maskz_broadcastb_epi8(__mmask32 __M, __m128i __A) { // CHECK-LABEL: test_mm256_maskz_broadcastb_epi8 @@ -3173,6 +3259,7 @@ __m256i test_mm256_maskz_broadcastb_epi8(__mmask32 __M, __m128i __A) { // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_broadcastb_epi8(__M, __A); } +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_broadcastb_epi8((__mmask32)0xAAAAAAAA,(__m128i)(__v16qs){-120,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120)); __m128i test_mm_mask_broadcastw_epi16(__m128i __O, __mmask8 __M, __m128i __A) { // CHECK-LABEL: test_mm_mask_broadcastw_epi16 @@ -3180,6 +3267,7 @@ __m128i test_mm_mask_broadcastw_epi16(__m128i __O, __mmask8 __M, __m128i __A) { // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_broadcastw_epi16(__O, __M, __A); } +TEST_CONSTEXPR(match_v8hi(_mm_mask_broadcastw_epi16((__m128i)(__v8hi){0,1,2,3,4,5,6,7},(__mmask8)0xAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,2,-120,4,-120,6,-120)); __m128i test_mm_maskz_broadcastw_epi16(__mmask8 __M, __m128i __A) { // CHECK-LABEL: test_mm_maskz_broadcastw_epi16 @@ -3187,6 +3275,7 @@ __m128i test_mm_maskz_broadcastw_epi16(__mmask8 __M, __m128i __A) { // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_broadcastw_epi16(__M, __A); } +TEST_CONSTEXPR(match_v8hi(_mm_maskz_broadcastw_epi16((__mmask8)0xAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,0,-120,0,-120,0,-120)); __m256i test_mm256_mask_broadcastw_epi16(__m256i __O, __mmask16 __M, __m128i __A) { // CHECK-LABEL: test_mm256_mask_broadcastw_epi16 @@ -3194,6 +3283,7 @@ __m256i test_mm256_mask_broadcastw_epi16(__m256i __O, __mmask16 __M, __m128i __A // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_broadcastw_epi16(__O, __M, __A); } +TEST_CONSTEXPR(match_v16hi(_mm256_mask_broadcastw_epi16((__m256i)(__v16hi){0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31},(__mmask16)0xAAAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,2,-120,4,-120,6,-120,24,-120,26,-120,28,-120,30,-120)); __m256i test_mm256_maskz_broadcastw_epi16(__mmask16 __M, __m128i __A) { // CHECK-LABEL: test_mm256_maskz_broadcastw_epi16 @@ -3201,6 +3291,8 @@ __m256i test_mm256_maskz_broadcastw_epi16(__mmask16 __M, __m128i __A) { // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_broadcastw_epi16(__M, __A); } +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_broadcastw_epi16((__mmask16)0xAAAA,(__m128i)(__v8hi){-120,1,2,3,4,5,6,7}),0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120,0,-120)); + __m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A){ // CHECK-LABEL: test_mm_mask_set1_epi8 // CHECK: insertelement <16 x i8> poison, i8 %{{.*}}, i32 0 @@ -3222,6 +3314,8 @@ __m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A){ // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_set1_epi8(__O, __M, __A); } +TEST_CONSTEXPR(match_v16qi(_mm_mask_set1_epi8((__m128i)(__v16qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},(__mmask16)0xAAAA,(char)42),1,42,3,42,5,42,7,42,9,42,11,42,13,42,15,42)); + __m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){ // CHECK-LABEL: test_mm_maskz_set1_epi8 // CHECK: insertelement <16 x i8> poison, i8 %{{.*}}, i32 0 @@ -3243,6 +3337,7 @@ __m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){ // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_set1_epi8( __M, __A); } +TEST_CONSTEXPR(match_v16qi(_mm_maskz_set1_epi8((__mmask16)0xAAAA,(char)42),0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42)); __m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) { // CHECK-LABEL: test_mm256_mask_set1_epi8 @@ -3281,6 +3376,7 @@ __m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) { // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_set1_epi8(__O, __M, __A); } +TEST_CONSTEXPR(match_v32qi(_mm256_mask_set1_epi8((__m256i)(__v32qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64},(__mmask32)0xAAAAAAAA,(char)42),1,42,3,42,5,42,7,42,9,42,11,42,13,42,15,42,49,42,51,42,53,42,55,42,57,42,59,42,61,42,63,42)); __m256i test_mm256_maskz_set1_epi8( __mmask32 __M, char __A) { // CHECK-LABEL: test_mm256_maskz_set1_epi8 @@ -3319,7 +3415,7 @@ __m256i test_mm256_maskz_set1_epi8( __mmask32 __M, char __A) { // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_set1_epi8( __M, __A); } - +TEST_CONSTEXPR( match_v32qi( _mm256_maskz_set1_epi8( (__mmask32)0xAAAAAAAA, (char)42 ), 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42 ) ); __m256i test_mm256_mask_set1_epi16(__m256i __O, __mmask16 __M, short __A) { // CHECK-LABEL: test_mm256_mask_set1_epi16 @@ -3342,6 +3438,7 @@ __m256i test_mm256_mask_set1_epi16(__m256i __O, __mmask16 __M, short __A) { // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_set1_epi16(__O, __M, __A); } +TEST_CONSTEXPR(match_v16hi(_mm256_mask_set1_epi16((__m256i)(__v16hi){1,2,3,4,5,6,7,8,25,26,27,28,29,30,31,32},(__mmask16)0xAAAA,42),1,42,3,42,5,42,7,42,25,42,27,42,29,42,31,42)); __m256i test_mm256_maskz_set1_epi16(__mmask16 __M, short __A) { // CHECK-LABEL: test_mm256_maskz_set1_epi16 @@ -3364,6 +3461,7 @@ __m256i test_mm256_maskz_set1_epi16(__mmask16 __M, short __A) { // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_set1_epi16(__M, __A); } +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_set1_epi16((__mmask16)0xAAAA,42),0,42,0,42,0,42,0,42,0,42,0,42,0,42,0,42)); __m128i test_mm_mask_set1_epi16(__m128i __O, __mmask8 __M, short __A) { // CHECK-LABEL: test_mm_mask_set1_epi16 @@ -3378,6 +3476,7 @@ __m128i test_mm_mask_set1_epi16(__m128i __O, __mmask8 __M, short __A) { // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_set1_epi16(__O, __M, __A); } +TEST_CONSTEXPR(match_v8hi(_mm_mask_set1_epi16((__m128i)(__v8hi){1,2,3,4,5,6,7,8},(__mmask8)0xAA,42),1,42,3,42,5,42,7,42)); __m128i test_mm_maskz_set1_epi16(__mmask8 __M, short __A) { // CHECK-LABEL: test_mm_maskz_set1_epi16 @@ -3392,6 +3491,8 @@ __m128i test_mm_maskz_set1_epi16(__mmask8 __M, short __A) { // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_set1_epi16(__M, __A); } +TEST_CONSTEXPR(match_v8hi(_mm_maskz_set1_epi16((__mmask8)0xAA,42),0,42,0,42,0,42,0,42)); + __m128i test_mm_permutexvar_epi16(__m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_permutexvar_epi16 // CHECK: @llvm.x86.avx512.permvar.hi.128 diff --git a/clang/test/DebugInfo/CXX/decl-member-call.cpp b/clang/test/DebugInfo/CXX/decl-member-call.cpp new file mode 100644 index 0000000000000..95758a2985c0c --- /dev/null +++ b/clang/test/DebugInfo/CXX/decl-member-call.cpp @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 -O1 -triple x86_64-unknown_unknown -emit-llvm \ +// RUN: -debug-info-kind=standalone -dwarf-version=5 %s -o - | FileCheck %s + +// Ensure both nonmember and member calls to declared function +// have attached `DISubprogram`s. + +int nonmember(int n); + +struct S { + int x; + int member(int n); +}; + +int main(int argc, char** argv) { + struct S s = {}; + int a = s.member(argc); + int b = nonmember(argc); + return a + b; +} + +// CHECK: declare !dbg ![[SP1:[0-9]+]] noundef i32 @_ZN1S6memberEi( +// CHECK: declare !dbg ![[SP2:[0-9]+]] noundef i32 @_Z9nonmemberi( + +// CHECK: ![[SP1]] = !DISubprogram(name: "member", linkageName: "_ZN1S6memberEi" +// CHECK: ![[SP2]] = !DISubprogram(name: "nonmember", linkageName: "_Z9nonmemberi" diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index 4c7ea5e338a13..3cab4c600b1b1 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -3300,6 +3300,72 @@ TEST_P(ImportExpr, ConceptNestedNonInstantiationDependentRequirement) { conceptDecl(has(requiresExpr(has(requiresExprBodyDecl()))))); } +TEST_P(ImportExpr, ImportSubstNonTypeTemplateParmPackExpr) { + MatchVerifier Verifier; + const char *Code = R"( + template struct X {}; + template struct Z {}; + + template struct E { + template using B = Z...>; + template E(B); + }; + using declToImport = E<1, 3>; + )"; + testImport(Code, Lang_CXX20, "", Lang_CXX20, Verifier, + typedefNameDecl(hasName("declToImport"))); +} + +TEST_P(ImportExpr, ImportCXXParenListInitExpr) { + MatchVerifier Verifier; + const char *Code = R"( + struct Node { + int val; + double d; + }; + Node* declToImport() { return new Node(2, 3.14); } + )"; + testImport(Code, Lang_CXX20, "", Lang_CXX20, Verifier, + functionDecl(hasName("declToImport"))); +} + +TEST_P(ImportExpr, ImportPseudoObjectExpr) { + MatchVerifier Verifier; + const char *Code = R"( + namespace std { + struct strong_ordering { + int n; + constexpr operator int() const { return n; } + static const strong_ordering less, equal, greater; + }; + constexpr strong_ordering strong_ordering::less{-1}, + strong_ordering::equal{0}, strong_ordering::greater{1}; + } + + struct A { + std::strong_ordering operator<=>(const A&) const; + }; + struct B { + bool operator==(const B&) const; + bool operator<(const B&) const; + }; + + template struct Cmp : T { + std::strong_ordering operator<=>(const Cmp&) const = default; + }; + + void use(...); + void declToImport() { + use( + Cmp() <=> Cmp(), + Cmp() <=> Cmp() + ); + } + )"; + testImport(Code, Lang_CXX20, "", Lang_CXX20, Verifier, + functionDecl(hasName("declToImport"))); +} + class ImportImplicitMethods : public ASTImporterOptionSpecificTestBase { public: static constexpr auto DefaultCode = R"( diff --git a/clang/www/c_status.html b/clang/www/c_status.html index 8220bc433d815..6e25ad7df01ce 100644 --- a/clang/www/c_status.html +++ b/clang/www/c_status.html @@ -324,7 +324,7 @@

C2y implementation status

Matching of Multi-Dimensional Arrays in Generic Selection Expressions
N3348 - Unknown + No The __COUNTER__ predefined macro diff --git a/llvm/include/llvm/Bitcode/BitcodeWriter.h b/llvm/include/llvm/Bitcode/BitcodeWriter.h index 1e72e847137a3..d88e261f8c684 100644 --- a/llvm/include/llvm/Bitcode/BitcodeWriter.h +++ b/llvm/include/llvm/Bitcode/BitcodeWriter.h @@ -20,7 +20,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/MemoryBufferRef.h" #include -#include #include namespace llvm { diff --git a/llvm/include/llvm/DebugInfo/GSYM/GsymContext.h b/llvm/include/llvm/DebugInfo/GSYM/GsymContext.h index e3e9b2bb91e8a..f9382fa8d9577 100644 --- a/llvm/include/llvm/DebugInfo/GSYM/GsymContext.h +++ b/llvm/include/llvm/DebugInfo/GSYM/GsymContext.h @@ -12,7 +12,6 @@ #include "llvm/DebugInfo/DIContext.h" #include #include -#include namespace llvm { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h index be8cb927c26df..fc01afc6d8739 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h @@ -45,7 +45,6 @@ #include #include #include -#include #include namespace llvm { diff --git a/llvm/include/llvm/IR/DebugInfo.h b/llvm/include/llvm/IR/DebugInfo.h index 33e6df0ecb873..862293c9666a7 100644 --- a/llvm/include/llvm/IR/DebugInfo.h +++ b/llvm/include/llvm/IR/DebugInfo.h @@ -108,7 +108,7 @@ class DebugInfoFinder { LLVM_ABI void processInstruction(const Module &M, const Instruction &I); /// Process a DILocalVariable. - LLVM_ABI void processVariable(DILocalVariable *DVI); + LLVM_ABI void processVariable(const DILocalVariable *DVI); /// Process debug info location. LLVM_ABI void processLocation(const Module &M, const DILocation *Loc); /// Process a DbgRecord. @@ -124,7 +124,7 @@ class DebugInfoFinder { void processCompileUnit(DICompileUnit *CU); void processScope(DIScope *Scope); void processType(DIType *DT); - void processImportedEntity(DIImportedEntity *Import); + void processImportedEntity(const DIImportedEntity *Import); bool addCompileUnit(DICompileUnit *CU); bool addGlobalVariable(DIGlobalVariableExpression *DIG); bool addScope(DIScope *Scope); diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h index 0270f9f20e48c..20fa9d15bf5ff 100644 --- a/llvm/include/llvm/IR/DebugInfoMetadata.h +++ b/llvm/include/llvm/IR/DebugInfoMetadata.h @@ -2568,6 +2568,39 @@ class DISubprogram : public DILocalScope { replaceOperandWith(7, N.get()); } + /// For the given retained node of DISubprogram, applies one of the + /// given functions depending on the type of the node. + template + static T + visitRetainedNode(const Metadata *N, FuncLVT &&FuncLV, FuncLabelT &&FuncLabel, + FuncImportedEntityT &&FuncIE, FuncUnknownT &&FuncUnknown) { + if (const auto *LV = dyn_cast(N)) + return FuncLV(LV); + if (const auto *L = dyn_cast(N)) + return FuncLabel(L); + if (const auto *IE = dyn_cast(N)) + return FuncIE(IE); + return FuncUnknown(N); + } + + /// Returns the scope of subprogram's retainedNodes. + static const DILocalScope *getRetainedNodeScope(const MDNode *N); + // For use in Verifier. + static const DIScope *getRawRetainedNodeScope(const MDNode *N); + + /// For each retained node, applies one of the given functions depending + /// on the type of a node. + template + void forEachRetainedNode(FuncLVT &&FuncLV, FuncLabelT &&FuncLabel, + FuncImportedEntityT &&FuncIE) const { + for (MDNode *N : getRetainedNodes()) + visitRetainedNode(N, FuncLV, FuncLabel, FuncIE, + [](const Metadata *N) { + llvm_unreachable("Unexpected retained node!"); + }); + } + /// Check if this subprogram describes the given function. /// /// FIXME: Should this be looking through bitcasts? diff --git a/llvm/include/llvm/IR/MemoryModelRelaxationAnnotations.h b/llvm/include/llvm/IR/MemoryModelRelaxationAnnotations.h index 535635a9ad9b0..fcfb2db85a880 100644 --- a/llvm/include/llvm/IR/MemoryModelRelaxationAnnotations.h +++ b/llvm/include/llvm/IR/MemoryModelRelaxationAnnotations.h @@ -21,7 +21,8 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Compiler.h" -#include // for std::pair + +#include namespace llvm { diff --git a/llvm/include/llvm/MC/DXContainerPSVInfo.h b/llvm/include/llvm/MC/DXContainerPSVInfo.h index 3a2d2949d0223..eb6d9e14d92c3 100644 --- a/llvm/include/llvm/MC/DXContainerPSVInfo.h +++ b/llvm/include/llvm/MC/DXContainerPSVInfo.h @@ -17,7 +17,6 @@ #include "llvm/TargetParser/Triple.h" #include -#include #include namespace llvm { diff --git a/llvm/include/llvm/MC/MCAssembler.h b/llvm/include/llvm/MC/MCAssembler.h index 152b81e284c1a..dbae271a1c198 100644 --- a/llvm/include/llvm/MC/MCAssembler.h +++ b/llvm/include/llvm/MC/MCAssembler.h @@ -24,7 +24,6 @@ #include #include #include -#include #include namespace llvm { diff --git a/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h b/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h index 69b8f9f000e1d..af9d809833023 100644 --- a/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h +++ b/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h @@ -16,7 +16,6 @@ #include "llvm/Remarks/RemarkSerializer.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/YAMLTraits.h" -#include namespace llvm { namespace remarks { diff --git a/llvm/include/llvm/Support/FormatVariadic.h b/llvm/include/llvm/Support/FormatVariadic.h index 85652924491ba..fdd448f7b5a3a 100644 --- a/llvm/include/llvm/Support/FormatVariadic.h +++ b/llvm/include/llvm/Support/FormatVariadic.h @@ -37,7 +37,6 @@ #include "llvm/Support/raw_ostream.h" #include #include -#include #include #include #include diff --git a/llvm/include/llvm/Support/Jobserver.h b/llvm/include/llvm/Support/Jobserver.h index 6bee3b5671d55..3c0c04537735d 100644 --- a/llvm/include/llvm/Support/Jobserver.h +++ b/llvm/include/llvm/Support/Jobserver.h @@ -68,7 +68,6 @@ #include "llvm/ADT/StringRef.h" #include -#include namespace llvm { diff --git a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h index a03a38466b27b..1a19eb94e60ea 100644 --- a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h +++ b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h @@ -24,7 +24,6 @@ #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Compiler.h" #include "llvm/Transforms/Utils/ValueMapper.h" -#include #include namespace llvm { diff --git a/llvm/include/llvm/Transforms/Scalar/Scalarizer.h b/llvm/include/llvm/Transforms/Scalar/Scalarizer.h index 12513c2a704f2..35c9adbe17677 100644 --- a/llvm/include/llvm/Transforms/Scalar/Scalarizer.h +++ b/llvm/include/llvm/Transforms/Scalar/Scalarizer.h @@ -20,7 +20,6 @@ #include "llvm/IR/PassManager.h" #include "llvm/Support/Compiler.h" -#include namespace llvm { diff --git a/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h index cb48bb01e178a..19b573d6546a0 100644 --- a/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h +++ b/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h @@ -14,7 +14,6 @@ #define LLVM_TRANSFORMS_UTILS_LOWERVECTORINTRINSICS_H #include -#include namespace llvm { diff --git a/llvm/include/llvm/Transforms/Utils/SplitModuleByCategory.h b/llvm/include/llvm/Transforms/Utils/SplitModuleByCategory.h index cfcd1611e27fe..47aa2ff5930b0 100644 --- a/llvm/include/llvm/Transforms/Utils/SplitModuleByCategory.h +++ b/llvm/include/llvm/Transforms/Utils/SplitModuleByCategory.h @@ -16,7 +16,6 @@ #include #include -#include namespace llvm { diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp index 27114e0705a1d..033f516abe017 100644 --- a/llvm/lib/Analysis/DXILResource.cpp +++ b/llvm/lib/Analysis/DXILResource.cpp @@ -23,7 +23,6 @@ #include "llvm/Support/DXILABI.h" #include "llvm/Support/FormatVariadic.h" #include -#include #define DEBUG_TYPE "dxil-resource" diff --git a/llvm/lib/Analysis/TFLiteUtils.cpp b/llvm/lib/Analysis/TFLiteUtils.cpp index 2762e22f28cef..fcef1c8aa7380 100644 --- a/llvm/lib/Analysis/TFLiteUtils.cpp +++ b/llvm/lib/Analysis/TFLiteUtils.cpp @@ -30,7 +30,6 @@ #include "tensorflow/lite/logger.h" #include -#include #include using namespace llvm; diff --git a/llvm/lib/Analysis/TrainingLogger.cpp b/llvm/lib/Analysis/TrainingLogger.cpp index 344ca92e18b51..39f79cffdcd88 100644 --- a/llvm/lib/Analysis/TrainingLogger.cpp +++ b/llvm/lib/Analysis/TrainingLogger.cpp @@ -23,7 +23,6 @@ #include "llvm/Support/raw_ostream.h" #include -#include using namespace llvm; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 7fbadad34058f..7588a47b2b975 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -1608,18 +1608,8 @@ void DwarfDebug::ensureAbstractEntityIsCreatedIfScoped(DwarfCompileUnit &CU, } static const DILocalScope *getRetainedNodeScope(const MDNode *N) { - const DIScope *S; - if (const auto *LV = dyn_cast(N)) - S = LV->getScope(); - else if (const auto *L = dyn_cast(N)) - S = L->getScope(); - else if (const auto *IE = dyn_cast(N)) - S = IE->getScope(); - else - llvm_unreachable("Unexpected retained node!"); - // Ensure the scope is not a DILexicalBlockFile. - return cast(S)->getNonLexicalBlockFileScope(); + return DISubprogram::getRetainedNodeScope(N)->getNonLexicalBlockFileScope(); } // Collect variable information from side table maintained by MF. diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 74ccc34bd5a9d..7068eb440e31b 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -1840,7 +1840,8 @@ bool CodeGenPrepare::unfoldPowerOf2Test(CmpInst *Cmp) { /// lose; some adjustment may be wanted there. /// /// Return true if any changes are made. -static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) { +static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI, + const DataLayout &DL) { if (TLI.hasMultipleConditionRegisters(EVT::getEVT(Cmp->getType()))) return false; @@ -1848,6 +1849,18 @@ static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) { if (TLI.useSoftFloat() && isa(Cmp)) return false; + bool UsedInPhiOrCurrentBlock = any_of(Cmp->users(), [Cmp](User *U) { + return isa(U) || + cast(U)->getParent() == Cmp->getParent(); + }); + + // Avoid sinking larger than legal integer comparisons unless its ONLY used in + // another BB. + if (UsedInPhiOrCurrentBlock && Cmp->getOperand(0)->getType()->isIntegerTy() && + Cmp->getOperand(0)->getType()->getScalarSizeInBits() > + DL.getLargestLegalIntTypeSizeInBits()) + return false; + // Only insert a cmp in each block once. DenseMap InsertedCmps; @@ -2225,7 +2238,7 @@ bool CodeGenPrepare::optimizeURem(Instruction *Rem) { } bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) { - if (sinkCmpExpression(Cmp, *TLI)) + if (sinkCmpExpression(Cmp, *TLI, *DL)) return true; if (combineToUAddWithOverflow(Cmp, ModifiedDT)) diff --git a/llvm/lib/CodeGen/TargetSchedule.cpp b/llvm/lib/CodeGen/TargetSchedule.cpp index 7ae9e0e37bbab..cd951a1a4f53e 100644 --- a/llvm/lib/CodeGen/TargetSchedule.cpp +++ b/llvm/lib/CodeGen/TargetSchedule.cpp @@ -25,7 +25,6 @@ #include "llvm/Support/raw_ostream.h" #include #include -#include using namespace llvm; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnwindTablePrinter.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnwindTablePrinter.cpp index a88f4a554bcf0..a4bdd1f0a867c 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFUnwindTablePrinter.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFUnwindTablePrinter.cpp @@ -15,7 +15,6 @@ #include #include #include -#include using namespace llvm; using namespace dwarf; diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp index c4aa2c7638450..45818deda8aa6 100644 --- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp +++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp @@ -29,7 +29,6 @@ #include "llvm/Transforms/Utils/ModuleUtils.h" #include -#include #include using namespace llvm; diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index bf8aacc67cc10..ba3c039503720 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -247,7 +247,7 @@ void DebugInfoFinder::processType(DIType *DT) { } } -void DebugInfoFinder::processImportedEntity(DIImportedEntity *Import) { +void DebugInfoFinder::processImportedEntity(const DIImportedEntity *Import) { auto *Entity = Import->getEntity(); if (auto *T = dyn_cast(Entity)) processType(T); @@ -307,15 +307,13 @@ void DebugInfoFinder::processSubprogram(DISubprogram *SP) { } } - for (auto *N : SP->getRetainedNodes()) { - if (auto *Var = dyn_cast_or_null(N)) - processVariable(Var); - else if (auto *Import = dyn_cast_or_null(N)) - processImportedEntity(Import); - } + SP->forEachRetainedNode( + [this](const DILocalVariable *LV) { processVariable(LV); }, + [](const DILabel *L) {}, + [this](const DIImportedEntity *IE) { processImportedEntity(IE); }); } -void DebugInfoFinder::processVariable(DILocalVariable *DV) { +void DebugInfoFinder::processVariable(const DILocalVariable *DV) { if (!NodesSeen.insert(DV).second) return; processScope(DV->getScope()); diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index 0f9e24fd99a5c..c2975e68a4f06 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -1441,6 +1441,19 @@ bool DISubprogram::describes(const Function *F) const { assert(F && "Invalid function"); return F->getSubprogram() == this; } + +const DIScope *DISubprogram::getRawRetainedNodeScope(const MDNode *N) { + return visitRetainedNode( + N, [](const DILocalVariable *LV) { return LV->getScope(); }, + [](const DILabel *L) { return L->getScope(); }, + [](const DIImportedEntity *IE) { return IE->getScope(); }, + [](const Metadata *N) { return nullptr; }); +} + +const DILocalScope *DISubprogram::getRetainedNodeScope(const MDNode *N) { + return cast(getRawRetainedNodeScope(N)); +} + DILexicalBlockBase::DILexicalBlockBase(LLVMContext &C, unsigned ID, StorageType Storage, ArrayRef Ops) diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 427d91afe8c93..6031659e4002e 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -1575,11 +1575,27 @@ void Verifier::visitDISubprogram(const DISubprogram &N) { auto *Node = dyn_cast(RawNode); CheckDI(Node, "invalid retained nodes list", &N, RawNode); for (Metadata *Op : Node->operands()) { - CheckDI(Op && (isa(Op) || isa(Op) || - isa(Op)), + CheckDI(Op, "nullptr in retained nodes", &N, Node); + + auto True = [](const Metadata *) { return true; }; + auto False = [](const Metadata *) { return false; }; + bool IsTypeCorrect = + DISubprogram::visitRetainedNode(Op, True, True, True, False); + CheckDI(IsTypeCorrect, "invalid retained nodes, expected DILocalVariable, DILabel or " "DIImportedEntity", &N, Node, Op); + + auto *RetainedNode = cast(Op); + auto *RetainedNodeScope = dyn_cast_or_null( + DISubprogram::getRawRetainedNodeScope(RetainedNode)); + CheckDI(RetainedNodeScope, + "invalid retained nodes, retained node is not local", &N, Node, + RetainedNode); + CheckDI( + RetainedNodeScope->getSubprogram() == &N, + "invalid retained nodes, retained node does not belong to subprogram", + &N, Node, RetainedNode, RetainedNodeScope); } } CheckDI(!hasConflictingReferenceFlags(N.getFlags()), diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 6c140be59fc4b..8b95049eb9648 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -10,7 +10,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include -#include #include #define DEBUG_TYPE "mustache" diff --git a/llvm/lib/Support/Windows/Program.inc b/llvm/lib/Support/Windows/Program.inc index ec785e407cc57..5dcd2c945bc85 100644 --- a/llvm/lib/Support/Windows/Program.inc +++ b/llvm/lib/Support/Windows/Program.inc @@ -23,7 +23,6 @@ #include #include #include -#include #include //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 068954f1764fb..0bf2b31b10846 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -54,7 +54,6 @@ #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" #include #include -#include using namespace llvm; diff --git a/llvm/lib/Target/AVR/AVRTargetTransformInfo.h b/llvm/lib/Target/AVR/AVRTargetTransformInfo.h index 0daeeb8f11cfe..338a7c8082ca3 100644 --- a/llvm/lib/Target/AVR/AVRTargetTransformInfo.h +++ b/llvm/lib/Target/AVR/AVRTargetTransformInfo.h @@ -21,7 +21,6 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/IR/Function.h" -#include namespace llvm { diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 677203d1c016b..95577dd668e1e 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -29,7 +29,6 @@ #include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include -#include using namespace llvm; using namespace llvm::dxil; diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index b990b6c7410ac..ec82aa93dd07c 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -21,7 +21,6 @@ #include "llvm/IR/PassManager.h" #include "llvm/MC/DXContainerRootSignature.h" #include "llvm/Pass.h" -#include namespace llvm { namespace dxil { diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h index f2c00c7320e11..7cbc092ea3525 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.h @@ -19,7 +19,6 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/MemoryBufferRef.h" #include -#include #include namespace llvm { diff --git a/llvm/lib/Target/M68k/M68kSubtarget.h b/llvm/lib/Target/M68k/M68kSubtarget.h index 16ca7d2e6d0fd..4f9685814d9a9 100644 --- a/llvm/lib/Target/M68k/M68kSubtarget.h +++ b/llvm/lib/Target/M68k/M68kSubtarget.h @@ -27,8 +27,6 @@ #include "llvm/MC/MCInstrItineraries.h" #include "llvm/Support/Alignment.h" -#include - #define GET_SUBTARGETINFO_HEADER #include "M68kGenSubtargetInfo.inc" diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index f275802fe1843..7d933588025fe 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -23,7 +23,6 @@ #include "llvm/IR/DataLayout.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/TargetParser/Triple.h" -#include #define GET_SUBTARGETINFO_HEADER #include "PPCGenSubtargetInfo.inc" diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index fbb127df16dd9..b8cd9c1358f00 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -249,17 +249,18 @@ static InstrSignature instrToSignature(const MachineInstr &MI, InstrSignature Signature{MI.getOpcode()}; for (unsigned i = 0; i < MI.getNumOperands(); ++i) { // The only decorations that can be applied more than once to a given - // or structure member are UserSemantic(5635), CacheControlLoadINTEL (6442), - // and CacheControlStoreINTEL (6443). For all the rest of decorations, we - // will only add to the signature the Opcode, the id to which it applies, - // and the decoration id, disregarding any decoration flags. This will - // ensure that any subsequent decoration with the same id will be deemed as - // a duplicate. Then, at the call site, we will be able to handle duplicates - // in the best way. + // or structure member are FuncParamAttr (38), UserSemantic (5635), + // CacheControlLoadINTEL (6442), and CacheControlStoreINTEL (6443). For all + // the rest of decorations, we will only add to the signature the Opcode, + // the id to which it applies, and the decoration id, disregarding any + // decoration flags. This will ensure that any subsequent decoration with + // the same id will be deemed as a duplicate. Then, at the call site, we + // will be able to handle duplicates in the best way. unsigned Opcode = MI.getOpcode(); if ((Opcode == SPIRV::OpDecorate) && i >= 2) { unsigned DecorationID = MI.getOperand(1).getImm(); - if (DecorationID != SPIRV::Decoration::UserSemantic && + if (DecorationID != SPIRV::Decoration::FuncParamAttr && + DecorationID != SPIRV::Decoration::UserSemantic && DecorationID != SPIRV::Decoration::CacheControlLoadINTEL && DecorationID != SPIRV::Decoration::CacheControlStoreINTEL) continue; diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.h b/llvm/lib/Target/Sparc/SparcSubtarget.h index b1decca0a4f07..f575f6d7da37f 100644 --- a/llvm/lib/Target/Sparc/SparcSubtarget.h +++ b/llvm/lib/Target/Sparc/SparcSubtarget.h @@ -21,7 +21,6 @@ #include "llvm/IR/DataLayout.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/Triple.h" -#include #define GET_SUBTARGETINFO_HEADER #include "SparcGenSubtargetInfo.inc" diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 0c2bd7c302f33..d4ad98af9b30c 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -50,7 +50,6 @@ #include "llvm/Transforms/CFGuard.h" #include #include -#include using namespace llvm; diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index c06f33f726e76..c37e530f530f5 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -1945,6 +1945,10 @@ void InstrLowerer::emitNameData() { NamesVar = new GlobalVariable(M, NamesVal->getType(), true, GlobalValue::PrivateLinkage, NamesVal, getInstrProfNamesVarName()); + if (isGPUProfTarget(M)) { + NamesVar->setLinkage(GlobalValue::ExternalLinkage); + NamesVar->setVisibility(GlobalValue::ProtectedVisibility); + } NamesSize = CompressedNameStr.size(); setGlobalVariableLargeSection(TT, *NamesVar); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 45b5570261416..566d6eafee63e 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1232,6 +1232,30 @@ class LoopVectorizationCostModel { /// Superset of instructions that return true for isScalarWithPredication. bool isPredicatedInst(Instruction *I) const; + /// A helper function that returns how much we should divide the cost of a + /// predicated block by. Typically this is the reciprocal of the block + /// probability, i.e. if we return X we are assuming the predicated block will + /// execute once for every X iterations of the loop header so the block should + /// only contribute 1/X of its cost to the total cost calculation, but when + /// optimizing for code size it will just be 1 as code size costs don't depend + /// on execution probabilities. + /// + /// TODO: We should use actual block probability here, if available. + /// Currently, we always assume predicated blocks have a 50% chance of + /// executing, apart from blocks that are only predicated due to tail folding. + inline unsigned + getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind, + BasicBlock *BB) const { + // If a block wasn't originally predicated but was predicated due to + // e.g. tail folding, don't divide the cost. Tail folded loops may still be + // predicated in the final vector loop iteration, but for most loops that + // don't have low trip counts we can expect their probability to be close to + // zero. + if (!Legal->blockNeedsPredication(BB)) + return 1; + return CostKind == TTI::TCK_CodeSize ? 1 : 2; + } + /// Return the costs for our two available strategies for lowering a /// div/rem operation which requires speculating at least one lane. /// First result is for scalarization (will be invalid for scalable @@ -2887,7 +2911,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, // Scale the cost by the probability of executing the predicated blocks. // This assumes the predicated block for each vector lane is equally // likely. - ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind); + ScalarizationCost = + ScalarizationCost / getPredBlockCostDivisor(CostKind, I->getParent()); } InstructionCost SafeDivisorCost = 0; @@ -5032,7 +5057,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( } // Scale the total scalar cost by block probability. - ScalarCost /= getPredBlockCostDivisor(CostKind); + ScalarCost /= getPredBlockCostDivisor(CostKind, I->getParent()); // Compute the discount. A non-negative discount means the vector version // of the instruction costs more, and scalarizing would be beneficial. @@ -5082,10 +5107,11 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) { // stores and instructions that may divide by zero) will now be // unconditionally executed. For the scalar case, we may not always execute // the predicated block, if it is an if-else block. Thus, scale the block's - // cost by the probability of executing it. blockNeedsPredication from - // Legal is used so as to not include all blocks in tail folded loops. - if (VF.isScalar() && Legal->blockNeedsPredication(BB)) - BlockCost /= getPredBlockCostDivisor(CostKind); + // cost by the probability of executing it. + // getPredBlockCostDivisor will return 1 for blocks that are only predicated + // by the header mask when folding the tail. + if (VF.isScalar()) + BlockCost /= getPredBlockCostDivisor(CostKind, BB); Cost += BlockCost; } @@ -5164,7 +5190,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // conditional branches, but may not be executed for each vector lane. Scale // the cost by the probability of executing the predicated block. if (isPredicatedInst(I)) { - Cost /= getPredBlockCostDivisor(CostKind); + Cost /= getPredBlockCostDivisor(CostKind, I->getParent()); // Add the cost of an i1 extract and a branch auto *VecI1Ty = @@ -6732,6 +6758,10 @@ bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const { SkipCostComputation.contains(UI); } +unsigned VPCostContext::getPredBlockCostDivisor(BasicBlock *BB) const { + return CM.getPredBlockCostDivisor(CostKind, BB); +} + InstructionCost LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, VPCostContext &CostCtx) const { diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index 965426f86ff21..caabfa7275b69 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -50,21 +50,6 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF); Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step); -/// A helper function that returns how much we should divide the cost of a -/// predicated block by. Typically this is the reciprocal of the block -/// probability, i.e. if we return X we are assuming the predicated block will -/// execute once for every X iterations of the loop header so the block should -/// only contribute 1/X of its cost to the total cost calculation, but when -/// optimizing for code size it will just be 1 as code size costs don't depend -/// on execution probabilities. -/// -/// TODO: We should use actual block probability here, if available. Currently, -/// we always assume predicated blocks have a 50% chance of executing. -inline unsigned -getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind) { - return CostKind == TTI::TCK_CodeSize ? 1 : 2; -} - /// A range of powers-of-2 vectorization factors with fixed start and /// adjustable end. The range includes start and excludes end, e.g.,: /// [1, 16) = {1, 2, 4, 8} @@ -367,6 +352,10 @@ struct VPCostContext { /// has already been pre-computed. bool skipCostComputation(Instruction *UI, bool IsVector) const; + /// \returns how much the cost of a predicated block should be divided by. + /// Forwards to LoopVectorizationCostModel::getPredBlockCostDivisor. + unsigned getPredBlockCostDivisor(BasicBlock *BB) const; + /// Returns the OperandInfo for \p V, if it is a live-in. TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 80cd112dbcd8a..707886f873fba 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3349,7 +3349,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, // Scale the cost by the probability of executing the predicated blocks. // This assumes the predicated block for each vector lane is equally // likely. - ScalarCost /= getPredBlockCostDivisor(Ctx.CostKind); + ScalarCost /= Ctx.getPredBlockCostDivisor(UI->getParent()); return ScalarCost; } case Instruction::Load: diff --git a/llvm/test/CodeGen/AMDGPU/compute-known-bits-nofpclass.ll b/llvm/test/CodeGen/AMDGPU/compute-known-bits-nofpclass.ll new file mode 100644 index 0000000000000..d440d58246333 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/compute-known-bits-nofpclass.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s + +define i32 @known_positive(float nofpclass(nan ninf nzero nsub nnorm) %signbit.zero) #0 { +; CHECK-LABEL: known_positive: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cast = bitcast float %signbit.zero to i32 + %and = and i32 %cast, 2147483647 + ret i32 %and +} + +define i32 @known_positive_maybe_nan(float nofpclass(ninf nzero nsub nnorm) %signbit.zero) #0 { +; CHECK-LABEL: known_positive_maybe_nan: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cast = bitcast float %signbit.zero to i32 + %and = and i32 %cast, 2147483647 + ret i32 %and +} + +define i32 @known_negative(float nofpclass(nan pinf pzero psub pnorm) %signbit.one) #0 { +; CHECK-LABEL: known_negative: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_or_b32_e32 v0, 0x80000000, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cast = bitcast float %signbit.one to i32 + %or = or i32 %cast, -2147483648 + ret i32 %or +} + +define i32 @known_negative_maybe_nan(float nofpclass(pinf pzero psub pnorm) %signbit.one) #0 { +; CHECK-LABEL: known_negative_maybe_nan: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_or_b32_e32 v0, 0x80000000, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cast = bitcast float %signbit.one to i32 + %or = or i32 %cast, -2147483648 + ret i32 %or +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll index f5227eed458d6..12556817a66be 100644 --- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll @@ -370,4 +370,112 @@ define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) { ret float %pow_sign1 } +define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 { +; GFX9-LABEL: test_pow_fast_f64integral_y: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s16, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-NEXT: v_writelane_b32 v43, s16, 15 +; GFX9-NEXT: s_addk_i32 s32, 0x800 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v43, s34, 0 +; GFX9-NEXT: v_writelane_b32 v43, s35, 1 +; GFX9-NEXT: v_writelane_b32 v43, s36, 2 +; GFX9-NEXT: v_writelane_b32 v43, s37, 3 +; GFX9-NEXT: v_writelane_b32 v43, s38, 4 +; GFX9-NEXT: v_writelane_b32 v43, s39, 5 +; GFX9-NEXT: v_writelane_b32 v43, s48, 6 +; GFX9-NEXT: v_writelane_b32 v43, s49, 7 +; GFX9-NEXT: v_writelane_b32 v43, s50, 8 +; GFX9-NEXT: v_writelane_b32 v43, s51, 9 +; GFX9-NEXT: v_writelane_b32 v43, s52, 10 +; GFX9-NEXT: v_writelane_b32 v43, s53, 11 +; GFX9-NEXT: v_writelane_b32 v43, s54, 12 +; GFX9-NEXT: v_writelane_b32 v43, s30, 13 +; GFX9-NEXT: v_writelane_b32 v43, s31, 14 +; GFX9-NEXT: v_mov_b32_e32 v42, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, _Z4log2d@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, _Z4log2d@rel32@hi+12 +; GFX9-NEXT: v_mov_b32_e32 v40, v31 +; GFX9-NEXT: v_mov_b32_e32 v41, v2 +; GFX9-NEXT: s_mov_b32 s50, s15 +; GFX9-NEXT: s_mov_b32 s51, s14 +; GFX9-NEXT: s_mov_b32 s52, s13 +; GFX9-NEXT: s_mov_b32 s53, s12 +; GFX9-NEXT: s_mov_b64 s[34:35], s[10:11] +; GFX9-NEXT: s_mov_b64 s[36:37], s[8:9] +; GFX9-NEXT: s_mov_b64 s[38:39], s[6:7] +; GFX9-NEXT: s_mov_b64 s[48:49], s[4:5] +; GFX9-NEXT: s_brev_b32 s54, -2 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: v_cvt_f64_i32_e32 v[2:3], v41 +; GFX9-NEXT: s_getpc_b64 s[16:17] +; GFX9-NEXT: s_add_u32 s16, s16, _Z4exp2d@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s17, s17, _Z4exp2d@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[4:5], s[48:49] +; GFX9-NEXT: s_mov_b64 s[6:7], s[38:39] +; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_mov_b64 s[8:9], s[36:37] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s53 +; GFX9-NEXT: s_mov_b32 s13, s52 +; GFX9-NEXT: s_mov_b32 s14, s51 +; GFX9-NEXT: s_mov_b32 s15, s50 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 31, v41 +; GFX9-NEXT: v_and_b32_e32 v2, v2, v42 +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s30, v43, 13 +; GFX9-NEXT: v_bfi_b32 v1, s54, v1, v2 +; GFX9-NEXT: v_readlane_b32 s31, v43, 14 +; GFX9-NEXT: v_readlane_b32 s54, v43, 12 +; GFX9-NEXT: v_readlane_b32 s53, v43, 11 +; GFX9-NEXT: v_readlane_b32 s52, v43, 10 +; GFX9-NEXT: v_readlane_b32 s51, v43, 9 +; GFX9-NEXT: v_readlane_b32 s50, v43, 8 +; GFX9-NEXT: v_readlane_b32 s49, v43, 7 +; GFX9-NEXT: v_readlane_b32 s48, v43, 6 +; GFX9-NEXT: v_readlane_b32 s39, v43, 5 +; GFX9-NEXT: v_readlane_b32 s38, v43, 4 +; GFX9-NEXT: v_readlane_b32 s37, v43, 3 +; GFX9-NEXT: v_readlane_b32 s36, v43, 2 +; GFX9-NEXT: v_readlane_b32 s35, v43, 1 +; GFX9-NEXT: v_readlane_b32 s34, v43, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: v_readlane_b32 s4, v43, 15 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fabs = call fast double @llvm.fabs.f64(double %x) + %log2 = call fast double @_Z4log2d(double %fabs) + %pownI2F = sitofp i32 %y.i to double + %ylogx = fmul fast double %log2, %pownI2F + %exp2 = call fast nofpclass(nan ninf nzero nsub nnorm) double @_Z4exp2d(double %ylogx) + %ytou = zext i32 %y.i to i64 + %yeven = shl i64 %ytou, 63 + %x.i64 = bitcast double %x to i64 + %pow_sign = and i64 %yeven, %x.i64 + %pow_sign.f64 = bitcast i64 %pow_sign to double + %pow_sign1 = call fast double @llvm.copysign.f64(double %exp2, double %pow_sign.f64) + ret double %pow_sign1 +} + +declare hidden double @_Z4exp2d(double) #1 +declare hidden double @_Z4log2d(double) #1 + attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #1 = { norecurse nounwind memory(read) } diff --git a/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll b/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll new file mode 100644 index 0000000000000..b99b64316b62f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll @@ -0,0 +1,208 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s + +; Negative test, don't know %x is positive +define half @copysign_known_signmask_f16(half %x, i16 %sign) { +; GFX9-LABEL: copysign_known_signmask_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 15, v1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i16 %sign, 15 + %signmask.bitcast = bitcast i16 %signmask to half + %result = call half @llvm.copysign.f16(half %x, half %signmask.bitcast) + ret half %result +} + +; Negative test, don't know %x is positive +define float @copysign_known_signmask_f32(float %x, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result = call float @llvm.copysign.f32(float %x, float %signmask.bitcast) + ret float %result +} + +; Negative test, don't know %x is positive +define double @copysign_known_signmask_f64(double %x, i64 %sign) { +; GFX9-LABEL: copysign_known_signmask_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 31, v2 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i64 %sign, 63 + %signmask.bitcast = bitcast i64 %signmask to double + %result = call double @llvm.copysign.f64(double %x, double %signmask.bitcast) + ret double %result +} + +; Negative test, don't know %x is positive +define float @copysign_known_signmask_f32_known_not_known_positive_mag_maybe_nan(float nofpclass(ninf nzero nsub nnorm) %sign.bit.known.zero, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_not_known_positive_mag_maybe_nan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result = call float @llvm.copysign.f32(float %sign.bit.known.zero, float %signmask.bitcast) + ret float %result +} + +; Negative test, don't know %x is positive +define float @copysign_known_signmask_f32_known_not_known_positive_mag_maybe_negzero(float nofpclass(nan ninf nsub nnorm) %sign.bit.known.zero, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_not_known_positive_mag_maybe_negzero: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result = call float @llvm.copysign.f32(float %sign.bit.known.zero, float %signmask.bitcast) + ret float %result +} + +define half @copysign_known_signmask_f16_known_positive_mag(half nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i16 %sign) { +; GFX9-LABEL: copysign_known_signmask_f16_known_positive_mag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 15, v1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i16 %sign, 15 + %signmask.bitcast = bitcast i16 %signmask to half + %result = call half @llvm.copysign.f16(half %sign.bit.known.zero, half %signmask.bitcast) + ret half %result +} + +define float @copysign_known_signmask_f32_known_positive_mag(float nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result = call float @llvm.copysign.f32(float %sign.bit.known.zero, float %signmask.bitcast) + ret float %result +} + +define double @copysign_known_signmask_f64_known_positive_mag(double nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i64 %sign) { +; GFX9-LABEL: copysign_known_signmask_f64_known_positive_mag: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 31, v2 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i64 %sign, 63 + %signmask.bitcast = bitcast i64 %signmask to double + %result = call double @llvm.copysign.f64(double %sign.bit.known.zero, double %signmask.bitcast) + ret double %result +} + +; exp always returns a positive result, excluding the unknown nan sign +; bit. +define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp(float %x, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag__nnan_exp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0xc2aeac50 +; GFX9-NEXT: v_add_f32_e32 v2, 0x42800000, v0 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 +; GFX9-NEXT: v_exp_f32_e32 v0, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x114b4ea4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signbit.known.zero = call nnan afn float @llvm.exp.f32(float %x) + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result = call float @llvm.copysign.f32(float %signbit.known.zero, float %signmask.bitcast) + ret float %result +} + +define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp2(float %x, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag__nnan_exp2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x42800000 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_exp_f32_e32 v0, v0 +; GFX9-NEXT: v_not_b32_e32 v2, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x) + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result = call float @llvm.copysign.f32(float %signbit.known.zero, float %signmask.bitcast) + ret float %result +} + +define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp10(float %x, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag__nnan_exp10: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x42800000 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_exp_f32_e32 v0, v0 +; GFX9-NEXT: v_not_b32_e32 v2, 63 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x) + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %result = call float @llvm.copysign.f32(float %signbit.known.zero, float %signmask.bitcast) + ret float %result +} + +define float @copysign_known_signmask_f32_known_positive_mag_through_fence(float nofpclass(nan ninf nzero nsub nnorm) %sign.bit.known.zero, i32 %sign) { +; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag_through_fence: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX9-NEXT: ;ARITH_FENCE +; GFX9-NEXT: s_brev_b32 s4, -2 +; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %signmask = shl i32 %sign, 31 + %signmask.bitcast = bitcast i32 %signmask to float + %fence = call float @llvm.arithmetic.fence.f32(float %sign.bit.known.zero) + %result = call float @llvm.copysign.f32(float %fence, float %signmask.bitcast) + ret float %result +} diff --git a/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll new file mode 100644 index 0000000000000..1404385c4304c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/nofpclass-call.ll @@ -0,0 +1,199 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s + +; Check that nofpclass attributes on call returns are used in +; selectiondag. + +define internal float @func_f32(ptr addrspace(1) %ptr) { +; CHECK-LABEL: func_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_load_dword v0, v[0:1], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %ld = load volatile float, ptr addrspace(1) %ptr + ret float %ld +} + +define float @call_nofpclass_funcs_f32(ptr addrspace(1) %ptr) { +; CHECK-LABEL: call_nofpclass_funcs_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s18, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: v_writelane_b32 v4, s30, 0 +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v4, s31, 1 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, func_f32@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, func_f32@rel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v2, v0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_mov_b32_e32 v3, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, v2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_max_f32_e32 v1, v3, v3 +; CHECK-NEXT: v_max_f32_e32 v0, v0, v0 +; CHECK-NEXT: v_readlane_b32 s30, v4, 0 +; CHECK-NEXT: v_min_f32_e32 v0, v1, v0 +; CHECK-NEXT: v_readlane_b32 s31, v4, 1 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s33, s18 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call0 = call nofpclass(nan) float @func_f32(ptr addrspace(1) %ptr) + %call1 = call nofpclass(nan) float @func_f32(ptr addrspace(1) %ptr) + %min = call float @llvm.minnum.f32(float %call0, float %call1) + ret float %min +} + +define internal <2 x float> @func_v2f32(ptr addrspace(1) %ptr) { +; CHECK-LABEL: func_v2f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %ld = load volatile <2 x float>, ptr addrspace(1) %ptr + ret <2 x float> %ld +} + +define <2 x float> @call_nofpclass_funcs_v2f32(ptr addrspace(1) %ptr) { +; CHECK-LABEL: call_nofpclass_funcs_v2f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s18, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: v_writelane_b32 v6, s30, 0 +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v6, s31, 1 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, func_v2f32@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, func_v2f32@rel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v2, v1 +; CHECK-NEXT: v_mov_b32_e32 v3, v0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_mov_b32_e32 v4, v0 +; CHECK-NEXT: v_mov_b32_e32 v5, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, v3 +; CHECK-NEXT: v_mov_b32_e32 v1, v2 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_max_f32_e32 v2, v4, v4 +; CHECK-NEXT: v_max_f32_e32 v0, v0, v0 +; CHECK-NEXT: v_min_f32_e32 v0, v2, v0 +; CHECK-NEXT: v_max_f32_e32 v2, v5, v5 +; CHECK-NEXT: v_max_f32_e32 v1, v1, v1 +; CHECK-NEXT: v_readlane_b32 s30, v6, 0 +; CHECK-NEXT: v_min_f32_e32 v1, v2, v1 +; CHECK-NEXT: v_readlane_b32 s31, v6, 1 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s33, s18 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call0 = call nofpclass(nan) <2 x float> @func_v2f32(ptr addrspace(1) %ptr) + %call1 = call nofpclass(nan) <2 x float> @func_v2f32(ptr addrspace(1) %ptr) + %min = call <2 x float> @llvm.minnum.v2f32(<2 x float> %call0, <2 x float> %call1) + ret <2 x float> %min +} + +define internal double @func_f64(ptr addrspace(1) %ptr) { +; CHECK-LABEL: func_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %ld = load volatile double, ptr addrspace(1) %ptr + ret double %ld +} + +define double @call_nofpclass_funcs_f64(ptr addrspace(1) %ptr) { +; CHECK-LABEL: call_nofpclass_funcs_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s18, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[16:17] +; CHECK-NEXT: v_writelane_b32 v6, s30, 0 +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v6, s31, 1 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, func_f64@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, func_f64@rel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v4, v1 +; CHECK-NEXT: v_mov_b32_e32 v5, v0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_mov_b32_e32 v2, v0 +; CHECK-NEXT: v_mov_b32_e32 v3, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, v5 +; CHECK-NEXT: v_mov_b32_e32 v1, v4 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] +; CHECK-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; CHECK-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; CHECK-NEXT: v_readlane_b32 s30, v6, 0 +; CHECK-NEXT: v_readlane_b32 s31, v6, 1 +; CHECK-NEXT: s_mov_b32 s32, s33 +; CHECK-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s33, s18 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call0 = call nofpclass(nan) double @func_f64(ptr addrspace(1) %ptr) + %call1 = call nofpclass(nan) double @func_f64(ptr addrspace(1) %ptr) + %min = call double @llvm.minnum.f64(double %call0, double %call1) + ret double %min +} + +define float @call_nofpclass_intrinsic_f32(float %x, float %y, float %z) { +; CHECK-LABEL: call_nofpclass_intrinsic_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_sqrt_f32_e32 v0, v0 +; CHECK-NEXT: v_sqrt_f32_e32 v1, v1 +; CHECK-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call0 = call nofpclass(nan) float @llvm.amdgcn.sqrt.f32(float %x) + %call1 = call nofpclass(nan) float @llvm.amdgcn.sqrt.f32(float %y) + %lt = fcmp olt float %call0, %call1 + %min = select nsz i1 %lt, float %call0, float %call1 + ret float %min +} + +define <2 x half> @call_nofpclass_intrinsic_v2f16(float %x, float %y, float %z, float %w) { +; CHECK-LABEL: call_nofpclass_intrinsic_v2f16: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 +; CHECK-NEXT: v_cvt_pkrtz_f16_f32 v1, v2, v3 +; CHECK-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; CHECK-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; CHECK-NEXT: v_cmp_lt_f16_e32 vcc, v3, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; CHECK-NEXT: s_mov_b32 s4, 0x5040100 +; CHECK-NEXT: v_perm_b32 v0, v1, v0, s4 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %call0 = call nofpclass(nan) <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) + %call1 = call nofpclass(nan) <2 x half> @llvm.amdgcn.cvt.pkrtz(float %z, float %w) + %lt = fcmp olt <2 x half> %call0, %call1 + %min = select nsz <2 x i1> %lt, <2 x half> %call0, <2 x half> %call1 + ret <2 x half> %min +} diff --git a/llvm/test/CodeGen/PowerPC/milicode32.ll b/llvm/test/CodeGen/PowerPC/milicode32.ll index 78d036202fe4e..ddadd01a748f1 100644 --- a/llvm/test/CodeGen/PowerPC/milicode32.ll +++ b/llvm/test/CodeGen/PowerPC/milicode32.ll @@ -69,3 +69,59 @@ entry: } declare i32 @strlen(ptr noundef) nounwind + +define ptr @test_memmove(ptr noundef %destination, ptr noundef %source, i32 noundef %num) #0 { +; CHECK-AIX-32-P9-LABEL: test_memmove: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: mflr r0 +; CHECK-AIX-32-P9-NEXT: stwu r1, -80(r1) +; CHECK-AIX-32-P9-NEXT: stw r0, 88(r1) +; CHECK-AIX-32-P9-NEXT: stw r31, 76(r1) # 4-byte Folded Spill +; CHECK-AIX-32-P9-NEXT: mr r31, r3 +; CHECK-AIX-32-P9-NEXT: stw r3, 72(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, 68(r1) +; CHECK-AIX-32-P9-NEXT: stw r5, 64(r1) +; CHECK-AIX-32-P9-NEXT: bl .___memmove[PR] +; CHECK-AIX-32-P9-NEXT: nop +; CHECK-AIX-32-P9-NEXT: mr r3, r31 +; CHECK-AIX-32-P9-NEXT: lwz r31, 76(r1) # 4-byte Folded Reload +; CHECK-AIX-32-P9-NEXT: addi r1, r1, 80 +; CHECK-AIX-32-P9-NEXT: lwz r0, 8(r1) +; CHECK-AIX-32-P9-NEXT: mtlr r0 +; CHECK-AIX-32-P9-NEXT: blr +; +; CHECK-LINUX32-P9-LABEL: test_memmove: +; CHECK-LINUX32-P9: # %bb.0: # %entry +; CHECK-LINUX32-P9-NEXT: mflr r0 +; CHECK-LINUX32-P9-NEXT: stwu r1, -32(r1) +; CHECK-LINUX32-P9-NEXT: stw r0, 36(r1) +; CHECK-LINUX32-P9-NEXT: .cfi_def_cfa_offset 32 +; CHECK-LINUX32-P9-NEXT: .cfi_offset lr, 4 +; CHECK-LINUX32-P9-NEXT: .cfi_offset r30, -8 +; CHECK-LINUX32-P9-NEXT: stw r30, 24(r1) # 4-byte Folded Spill +; CHECK-LINUX32-P9-NEXT: mr r30, r3 +; CHECK-LINUX32-P9-NEXT: stw r3, 20(r1) +; CHECK-LINUX32-P9-NEXT: stw r4, 16(r1) +; CHECK-LINUX32-P9-NEXT: stw r5, 12(r1) +; CHECK-LINUX32-P9-NEXT: bl memmove +; CHECK-LINUX32-P9-NEXT: mr r3, r30 +; CHECK-LINUX32-P9-NEXT: lwz r30, 24(r1) # 4-byte Folded Reload +; CHECK-LINUX32-P9-NEXT: lwz r0, 36(r1) +; CHECK-LINUX32-P9-NEXT: addi r1, r1, 32 +; CHECK-LINUX32-P9-NEXT: mtlr r0 +; CHECK-LINUX32-P9-NEXT: blr +entry: + %destination.addr = alloca ptr, align 4 + %source.addr = alloca ptr, align 4 + %num.addr = alloca i32, align 4 + store ptr %destination, ptr %destination.addr, align 4 + store ptr %source, ptr %source.addr, align 4 + store i32 %num, ptr %num.addr, align 4 + %0 = load ptr, ptr %destination.addr, align 4 + %1 = load ptr, ptr %source.addr, align 4 + %2 = load i32, ptr %num.addr, align 4 + call void @llvm.memmove.p0.p0.i32(ptr align 1 %0, ptr align 1 %1, i32 %2, i1 false) + ret ptr %0 +} + +declare void @llvm.memmove.p0.p0.i32(ptr writeonly captures(none), ptr readonly captures(none), i32, i1 immarg) diff --git a/llvm/test/CodeGen/PowerPC/milicode64.ll b/llvm/test/CodeGen/PowerPC/milicode64.ll index 8b87529d9a6d8..f7814a424e0b9 100644 --- a/llvm/test/CodeGen/PowerPC/milicode64.ll +++ b/llvm/test/CodeGen/PowerPC/milicode64.ll @@ -100,3 +100,82 @@ entry: } declare i64 @strlen(ptr noundef) nounwind + +define ptr @test_memmove(ptr noundef %destination, ptr noundef %source, i64 noundef %num) #0 { +; CHECK-LE-P9-LABEL: test_memmove: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mflr r0 +; CHECK-LE-P9-NEXT: .cfi_def_cfa_offset 80 +; CHECK-LE-P9-NEXT: .cfi_offset lr, 16 +; CHECK-LE-P9-NEXT: .cfi_offset r30, -16 +; CHECK-LE-P9-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-LE-P9-NEXT: stdu r1, -80(r1) +; CHECK-LE-P9-NEXT: std r0, 96(r1) +; CHECK-LE-P9-NEXT: mr r30, r3 +; CHECK-LE-P9-NEXT: std r3, 56(r1) +; CHECK-LE-P9-NEXT: std r4, 48(r1) +; CHECK-LE-P9-NEXT: std r5, 40(r1) +; CHECK-LE-P9-NEXT: bl memmove +; CHECK-LE-P9-NEXT: nop +; CHECK-LE-P9-NEXT: mr r3, r30 +; CHECK-LE-P9-NEXT: addi r1, r1, 80 +; CHECK-LE-P9-NEXT: ld r0, 16(r1) +; CHECK-LE-P9-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-LE-P9-NEXT: mtlr r0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_memmove: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mflr r0 +; CHECK-BE-P9-NEXT: stdu r1, -160(r1) +; CHECK-BE-P9-NEXT: std r0, 176(r1) +; CHECK-BE-P9-NEXT: .cfi_def_cfa_offset 160 +; CHECK-BE-P9-NEXT: .cfi_offset lr, 16 +; CHECK-BE-P9-NEXT: .cfi_offset r30, -16 +; CHECK-BE-P9-NEXT: std r30, 144(r1) # 8-byte Folded Spill +; CHECK-BE-P9-NEXT: mr r30, r3 +; CHECK-BE-P9-NEXT: std r3, 136(r1) +; CHECK-BE-P9-NEXT: std r4, 128(r1) +; CHECK-BE-P9-NEXT: std r5, 120(r1) +; CHECK-BE-P9-NEXT: bl memmove +; CHECK-BE-P9-NEXT: nop +; CHECK-BE-P9-NEXT: mr r3, r30 +; CHECK-BE-P9-NEXT: ld r30, 144(r1) # 8-byte Folded Reload +; CHECK-BE-P9-NEXT: addi r1, r1, 160 +; CHECK-BE-P9-NEXT: ld r0, 16(r1) +; CHECK-BE-P9-NEXT: mtlr r0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_memmove: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mflr r0 +; CHECK-AIX-64-P9-NEXT: stdu r1, -144(r1) +; CHECK-AIX-64-P9-NEXT: std r0, 160(r1) +; CHECK-AIX-64-P9-NEXT: std r31, 136(r1) # 8-byte Folded Spill +; CHECK-AIX-64-P9-NEXT: mr r31, r3 +; CHECK-AIX-64-P9-NEXT: std r3, 128(r1) +; CHECK-AIX-64-P9-NEXT: std r4, 120(r1) +; CHECK-AIX-64-P9-NEXT: std r5, 112(r1) +; CHECK-AIX-64-P9-NEXT: bl .memmove[PR] +; CHECK-AIX-64-P9-NEXT: nop +; CHECK-AIX-64-P9-NEXT: mr r3, r31 +; CHECK-AIX-64-P9-NEXT: ld r31, 136(r1) # 8-byte Folded Reload +; CHECK-AIX-64-P9-NEXT: addi r1, r1, 144 +; CHECK-AIX-64-P9-NEXT: ld r0, 16(r1) +; CHECK-AIX-64-P9-NEXT: mtlr r0 +; CHECK-AIX-64-P9-NEXT: blr +entry: + %destination.addr = alloca ptr, align 8 + %source.addr = alloca ptr, align 8 + %num.addr = alloca i64, align 8 + store ptr %destination, ptr %destination.addr, align 8 + store ptr %source, ptr %source.addr, align 8 + store i64 %num, ptr %num.addr, align 8 + %0 = load ptr, ptr %destination.addr, align 8 + %1 = load ptr, ptr %source.addr, align 8 + %2 = load i64, ptr %num.addr, align 8 + call void @llvm.memmove.p0.p0.i64(ptr align 1 %0, ptr align 1 %1, i64 %2, i1 false) + ret ptr %0 +} + +declare void @llvm.memmove.p0.p0.i32(ptr writeonly captures(none), ptr readonly captures(none), i32, i1 immarg) diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll index ba6769b2aa3e1..0306bb18c2aed 100644 --- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll @@ -232,7 +232,7 @@ define i64 @uaddo3_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp { ret i64 %Q } -; TODO? CGP sinks the compare before we have a chance to form the overflow intrinsic. +; Ensure CGP doesn't sink the compare before we have a chance to form the overflow intrinsic. define i64 @uaddo4(i64 %a, i64 %b, i1 %c) nounwind ssp { ; RV32-LABEL: uaddo4: @@ -1076,41 +1076,37 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) { ; RV32-NEXT: .cfi_offset s4, -24 ; RV32-NEXT: .cfi_offset s5, -28 ; RV32-NEXT: .cfi_offset s6, -32 -; RV32-NEXT: mv s5, a5 -; RV32-NEXT: mv s3, a1 +; RV32-NEXT: mv s1, a5 +; RV32-NEXT: mv s4, a1 ; RV32-NEXT: andi a1, a5, 1 -; RV32-NEXT: beqz a1, .LBB32_8 +; RV32-NEXT: beqz a1, .LBB32_6 ; RV32-NEXT: # %bb.1: # %t ; RV32-NEXT: mv s0, a4 -; RV32-NEXT: mv s2, a3 -; RV32-NEXT: mv s1, a2 -; RV32-NEXT: mv s4, a0 -; RV32-NEXT: beq s3, a3, .LBB32_3 +; RV32-NEXT: mv s3, a3 +; RV32-NEXT: mv s2, a2 +; RV32-NEXT: mv s5, a0 +; RV32-NEXT: beq s4, a3, .LBB32_3 ; RV32-NEXT: # %bb.2: # %t -; RV32-NEXT: sltu s6, s3, s2 +; RV32-NEXT: sltu s6, s4, s3 ; RV32-NEXT: j .LBB32_4 ; RV32-NEXT: .LBB32_3: -; RV32-NEXT: sltu s6, s4, s1 +; RV32-NEXT: sltu s6, s5, s2 ; RV32-NEXT: .LBB32_4: # %t ; RV32-NEXT: mv a0, s6 ; RV32-NEXT: call call -; RV32-NEXT: beqz s6, .LBB32_8 +; RV32-NEXT: beqz s6, .LBB32_6 ; RV32-NEXT: # %bb.5: # %end -; RV32-NEXT: sltu a1, s4, s1 -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: beq s3, s2, .LBB32_7 -; RV32-NEXT: # %bb.6: # %end -; RV32-NEXT: sltu a0, s3, s2 -; RV32-NEXT: .LBB32_7: # %end -; RV32-NEXT: sub a2, s3, s2 -; RV32-NEXT: sub a3, s4, s1 -; RV32-NEXT: sub a2, a2, a1 -; RV32-NEXT: sw a3, 0(s0) -; RV32-NEXT: sw a2, 4(s0) -; RV32-NEXT: j .LBB32_9 -; RV32-NEXT: .LBB32_8: # %f -; RV32-NEXT: mv a0, s5 -; RV32-NEXT: .LBB32_9: # %f +; RV32-NEXT: sltu a0, s5, s2 +; RV32-NEXT: sub a1, s4, s3 +; RV32-NEXT: sub a2, s5, s2 +; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: sw a2, 0(s0) +; RV32-NEXT: sw a1, 4(s0) +; RV32-NEXT: mv a0, s6 +; RV32-NEXT: j .LBB32_7 +; RV32-NEXT: .LBB32_6: # %f +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: .LBB32_7: # %f ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll b/llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll new file mode 100644 index 0000000000000..1d3ba2a38e55b --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll @@ -0,0 +1,11 @@ +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s + +; CHECK: LLVM ERROR: %5:vfid(<2 x s64>) = nnan ninf nsz arcp afn reassoc G_INTRINSIC intrinsic(@llvm.spv.unpackhalf2x16), %0:iid(s64) is only supported with the GLSL extended instruction set. + +define hidden spir_func noundef nofpclass(nan inf) float @_Z9test_funcj(i32 noundef %0) local_unnamed_addr #0 { + %2 = tail call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.spv.unpackhalf2x16.v2f32(i32 %0) + %3 = extractelement <2 x float> %2, i64 0 + ret float %3 +} + diff --git a/llvm/test/CodeGen/SPIRV/spirv_param_decorations_quals.ll b/llvm/test/CodeGen/SPIRV/spirv_param_decorations_quals.ll index 260394b658348..fb550bb01a3a2 100644 --- a/llvm/test/CodeGen/SPIRV/spirv_param_decorations_quals.ll +++ b/llvm/test/CodeGen/SPIRV/spirv_param_decorations_quals.ll @@ -7,9 +7,11 @@ entry: ; CHECK-SPIRV: OpDecorate %[[#PId:]] Volatile ; CHECK-SPIRV: OpDecorate %[[#PId]] FuncParamAttr NoAlias +; CHECK-SPIRV: OpDecorate %[[#PId]] FuncParamAttr NoWrite ; CHECK-SPIRV: %[[#PId]] = OpFunctionParameter %[[#]] !7 = !{!"volatile"} !8 = !{i32 38, i32 4} ; FuncParamAttr NoAlias -!9 = !{!8} +!11 = !{i32 38, i32 6} ; FuncParamAttr NoWrite +!9 = !{!8, !11} !10 = !{!9} diff --git a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll index 1962ddebc2115..f2b4c49b1dbcd 100644 --- a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll +++ b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll @@ -36,10 +36,10 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: Ltmp0: +; CHECK-NEXT: Ltmp0: ## EH_LABEL ; CHECK-NEXT: ## implicit-def: $ebx ; CHECK-NEXT: calll __Znam -; CHECK-NEXT: Ltmp1: +; CHECK-NEXT: Ltmp1: ## EH_LABEL ; CHECK-NEXT: ## %bb.1: ## %bb11 ; CHECK-NEXT: movl %eax, %esi ; CHECK-NEXT: movb $1, %al @@ -58,13 +58,13 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: jne LBB0_9 ; CHECK-NEXT: ## %bb.10: ## %bb41 ; CHECK-NEXT: ## in Loop: Header=BB0_8 Depth=1 -; CHECK-NEXT: Ltmp2: +; CHECK-NEXT: Ltmp2: ## EH_LABEL ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %esi, (%esp) ; CHECK-NEXT: calll _Pjii -; CHECK-NEXT: Ltmp3: +; CHECK-NEXT: Ltmp3: ## EH_LABEL ; CHECK-NEXT: ## %bb.11: ## %bb42 ; CHECK-NEXT: ## in Loop: Header=BB0_8 Depth=1 ; CHECK-NEXT: xorl %eax, %eax @@ -126,20 +126,20 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: decl {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; CHECK-NEXT: jmp LBB0_8 ; CHECK-NEXT: LBB0_18: ## %bb43 -; CHECK-NEXT: Ltmp5: +; CHECK-NEXT: Ltmp5: ## EH_LABEL ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: calll _OnOverFlow -; CHECK-NEXT: Ltmp6: +; CHECK-NEXT: Ltmp6: ## EH_LABEL ; CHECK-NEXT: jmp LBB0_3 ; CHECK-NEXT: LBB0_2: ## %bb29 -; CHECK-NEXT: Ltmp7: +; CHECK-NEXT: Ltmp7: ## EH_LABEL ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: calll _OnOverFlow -; CHECK-NEXT: Ltmp8: +; CHECK-NEXT: Ltmp8: ## EH_LABEL ; CHECK-NEXT: LBB0_3: ## %bb30 ; CHECK-NEXT: ud2 ; CHECK-NEXT: LBB0_4: ## %bb20.loopexit -; CHECK-NEXT: Ltmp4: +; CHECK-NEXT: Ltmp4: ## EH_LABEL ; CHECK-NEXT: LBB0_9: ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: LBB0_6: ## %bb23 @@ -151,7 +151,7 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl ; CHECK-NEXT: LBB0_5: ## %bb20.loopexit.split-lp -; CHECK-NEXT: Ltmp9: +; CHECK-NEXT: Ltmp9: ## EH_LABEL ; CHECK-NEXT: jmp LBB0_6 ; CHECK-NEXT: Lfunc_end0: bb: diff --git a/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir b/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir index 348a2901ff6a4..24453066f2583 100644 --- a/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir +++ b/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir @@ -55,7 +55,7 @@ !9 = !DILocalVariable(name: "4", scope: !5, file: !1, line: 4, type: !10) !10 = !DIBasicType(name: "ty64", size: 64, encoding: DW_ATE_unsigned) !11 = !DILocation(line: 4, column: 1, scope: !5) - !12 = distinct !DISubprogram(name: "test_2", linkageName: "test_2", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) + !12 = distinct !DISubprogram(name: "test_2", linkageName: "test_2", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !7) ... --- diff --git a/llvm/test/CodeGen/X86/pr166534.ll b/llvm/test/CodeGen/X86/pr166534.ll index aef44cc3e40d0..162a0c93bfcf4 100644 --- a/llvm/test/CodeGen/X86/pr166534.ll +++ b/llvm/test/CodeGen/X86/pr166534.ll @@ -7,100 +7,64 @@ define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) { ; SSE2-LABEL: pr166534: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movq (%rdi), %rax -; SSE2-NEXT: movq 8(%rdi), %r8 ; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movq (%rsi), %r9 -; SSE2-NEXT: movq 8(%rsi), %rdi ; SSE2-NEXT: movdqu (%rsi), %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %esi -; SSE2-NEXT: xorl %r10d, %r10d +; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF -; SSE2-NEXT: sete %r10b -; SSE2-NEXT: orq %r10, (%rdx) +; SSE2-NEXT: sete %al +; SSE2-NEXT: orq %rax, (%rdx) ; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF ; SSE2-NEXT: jne .LBB0_2 ; SSE2-NEXT: # %bb.1: # %if.then -; SSE2-NEXT: xorq %r9, %rax -; SSE2-NEXT: xorq %rdi, %r8 -; SSE2-NEXT: xorl %edx, %edx -; SSE2-NEXT: orq %rax, %r8 -; SSE2-NEXT: sete %dl -; SSE2-NEXT: orq %rdx, (%rcx) +; SSE2-NEXT: orq %rax, (%rcx) ; SSE2-NEXT: .LBB0_2: # %if.end ; SSE2-NEXT: retq ; ; SSE4-LABEL: pr166534: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movq (%rdi), %rax -; SSE4-NEXT: movq 8(%rdi), %r8 ; SSE4-NEXT: movdqu (%rdi), %xmm0 -; SSE4-NEXT: movq (%rsi), %r9 -; SSE4-NEXT: movq 8(%rsi), %rdi ; SSE4-NEXT: movdqu (%rsi), %xmm1 ; SSE4-NEXT: pxor %xmm0, %xmm1 -; SSE4-NEXT: xorl %esi, %esi +; SSE4-NEXT: xorl %eax, %eax ; SSE4-NEXT: ptest %xmm1, %xmm1 -; SSE4-NEXT: sete %sil -; SSE4-NEXT: orq %rsi, (%rdx) +; SSE4-NEXT: sete %al +; SSE4-NEXT: orq %rax, (%rdx) ; SSE4-NEXT: ptest %xmm1, %xmm1 ; SSE4-NEXT: jne .LBB0_2 ; SSE4-NEXT: # %bb.1: # %if.then -; SSE4-NEXT: xorq %r9, %rax -; SSE4-NEXT: xorq %rdi, %r8 -; SSE4-NEXT: xorl %edx, %edx -; SSE4-NEXT: orq %rax, %r8 -; SSE4-NEXT: sete %dl -; SSE4-NEXT: orq %rdx, (%rcx) +; SSE4-NEXT: orq %rax, (%rcx) ; SSE4-NEXT: .LBB0_2: # %if.end ; SSE4-NEXT: retq ; ; AVX2-LABEL: pr166534: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: movq (%rdi), %rax -; AVX2-NEXT: movq 8(%rdi), %r8 ; AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; AVX2-NEXT: movq (%rsi), %rdi ; AVX2-NEXT: vpxor (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: movq 8(%rsi), %rsi -; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: vptest %xmm0, %xmm0 -; AVX2-NEXT: sete %r9b -; AVX2-NEXT: orq %r9, (%rdx) +; AVX2-NEXT: sete %al +; AVX2-NEXT: orq %rax, (%rdx) ; AVX2-NEXT: vptest %xmm0, %xmm0 ; AVX2-NEXT: jne .LBB0_2 ; AVX2-NEXT: # %bb.1: # %if.then -; AVX2-NEXT: xorq %rdi, %rax -; AVX2-NEXT: xorq %rsi, %r8 -; AVX2-NEXT: xorl %edx, %edx -; AVX2-NEXT: orq %rax, %r8 -; AVX2-NEXT: sete %dl -; AVX2-NEXT: orq %rdx, (%rcx) +; AVX2-NEXT: orq %rax, (%rcx) ; AVX2-NEXT: .LBB0_2: # %if.end ; AVX2-NEXT: retq ; ; AVX512-LABEL: pr166534: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %r8 ; AVX512-NEXT: vmovdqu (%rdi), %xmm0 -; AVX512-NEXT: movq (%rsi), %r9 -; AVX512-NEXT: movq 8(%rsi), %rdi ; AVX512-NEXT: vpxor (%rsi), %xmm0, %xmm0 -; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: xorl %eax, %eax ; AVX512-NEXT: vptest %xmm0, %xmm0 -; AVX512-NEXT: sete %sil -; AVX512-NEXT: orq %rsi, (%rdx) +; AVX512-NEXT: sete %al +; AVX512-NEXT: orq %rax, (%rdx) ; AVX512-NEXT: vptest %xmm0, %xmm0 ; AVX512-NEXT: jne .LBB0_2 ; AVX512-NEXT: # %bb.1: # %if.then -; AVX512-NEXT: xorq %r9, %rax -; AVX512-NEXT: xorq %rdi, %r8 -; AVX512-NEXT: xorl %edx, %edx -; AVX512-NEXT: orq %rax, %r8 -; AVX512-NEXT: sete %dl -; AVX512-NEXT: orq %rdx, (%rcx) +; AVX512-NEXT: orq %rax, (%rcx) ; AVX512-NEXT: .LBB0_2: # %if.end ; AVX512-NEXT: retq entry: diff --git a/llvm/test/DebugInfo/AMDGPU/heterogeneous-dwarf-diop-diexpression-address-spaces.ll b/llvm/test/DebugInfo/AMDGPU/heterogeneous-dwarf-diop-diexpression-address-spaces.ll index 60a055ad66b61..21e0c572dec9b 100644 --- a/llvm/test/DebugInfo/AMDGPU/heterogeneous-dwarf-diop-diexpression-address-spaces.ll +++ b/llvm/test/DebugInfo/AMDGPU/heterogeneous-dwarf-diop-diexpression-address-spaces.ll @@ -1,5 +1,5 @@ ; RUN: llc -O0 -mcpu=gfx1030 -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-dwarfdump --debug-info - | FileCheck %s - +; XFAIL: * ; CHECK-LABEL: DW_AT_name ("test_loc_single") define void @test_loc_single(ptr addrspace(3) %ptr) #0 !dbg !9 { ; Verify that the right address class attribute is attached to the variable's diff --git a/llvm/test/DebugInfo/MIR/X86/clobbered-fragments.mir b/llvm/test/DebugInfo/MIR/X86/clobbered-fragments.mir index a334e99b9cade..ea01835cae1e5 100644 --- a/llvm/test/DebugInfo/MIR/X86/clobbered-fragments.mir +++ b/llvm/test/DebugInfo/MIR/X86/clobbered-fragments.mir @@ -85,10 +85,11 @@ !15 = !DISubrange(count: 3) !16 = !DILocation(line: 8, scope: !8) !17 = !DILocation(line: 9, scope: !8) - !18 = distinct !DISubprogram(name: "test2", scope: !2, file: !2, line: 7, type: !9, scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !11) + !18 = distinct !DISubprogram(name: "test2", scope: !2, file: !2, line: 7, type: !9, scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !22) !19 = !DILocalVariable(name: "local", scope: !18, file: !2, line: 8, type: !13) !20 = !DILocation(line: 15, scope: !18) !21 = !DILocation(line: 16, scope: !18) + !22 = !{!19} ... --- diff --git a/llvm/test/DebugInfo/MIR/X86/machine-cse.mir b/llvm/test/DebugInfo/MIR/X86/machine-cse.mir index c38c0a1a79f75..63dc44fb705fe 100644 --- a/llvm/test/DebugInfo/MIR/X86/machine-cse.mir +++ b/llvm/test/DebugInfo/MIR/X86/machine-cse.mir @@ -73,13 +73,14 @@ !0 = !{i32 2, !"Debug Info Version", i32 3} !1 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !2, producer: "beards", isOptimized: true, runtimeVersion: 4, emissionKind: FullDebug) !2 = !DIFile(filename: "bees.cpp", directory: "") - !3 = distinct !DISubprogram(name: "nope", scope: !1, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !8) - !33 = distinct !DISubprogram(name: "alsonope", scope: !1, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !8) + !3 = distinct !DISubprogram(name: "nope", scope: !1, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !9) + !33 = distinct !DISubprogram(name: "alsonope", scope: !1, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !9) !4 = !DILocalVariable(name: "bees", scope: !3, type: !5) !5 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !6, size: 64) !6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) !7 = !DILocation(line: 0, scope: !3) !8 = !{!4} + !9 = !{} ; CHECK: ![[METAVAR:[0-9]+]] = !DILocalVariable(name: "bees", diff --git a/llvm/test/DebugInfo/MIR/X86/remove-redundant-dbg-vals.mir b/llvm/test/DebugInfo/MIR/X86/remove-redundant-dbg-vals.mir index 06ce18d8edaa7..28fc044e606b5 100644 --- a/llvm/test/DebugInfo/MIR/X86/remove-redundant-dbg-vals.mir +++ b/llvm/test/DebugInfo/MIR/X86/remove-redundant-dbg-vals.mir @@ -139,15 +139,15 @@ !23 = !DISubprogram(name: "bar", scope: !1, file: !1, line: 1, type: !24, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) !24 = !DISubroutineType(types: !25) !25 = !{null, !11} - !26 = distinct !DISubprogram(name: "foo2", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) + !26 = distinct !DISubprogram(name: "foo2", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !27 = !DILocation(line: 0, scope: !26) - !28 = distinct !DISubprogram(name: "foo3", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) + !28 = distinct !DISubprogram(name: "foo3", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !29 = !DILocation(line: 0, scope: !28) - !30 = distinct !DISubprogram(name: "foo4", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) + !30 = distinct !DISubprogram(name: "foo4", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !31 = !DILocation(line: 0, scope: !30) - !32 = distinct !DISubprogram(name: "foo5", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) + !32 = distinct !DISubprogram(name: "foo5", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !33 = !DILocation(line: 0, scope: !32) - !34 = distinct !DISubprogram(name: "foo6", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) + !34 = distinct !DISubprogram(name: "foo6", scope: !1, file: !1, line: 4, type: !9, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !35 = !DILocation(line: 0, scope: !34) ... diff --git a/llvm/test/DebugInfo/X86/instr-ref-selectiondag.ll b/llvm/test/DebugInfo/X86/instr-ref-selectiondag.ll index dbbef2b39587d..594607c6e95d8 100644 --- a/llvm/test/DebugInfo/X86/instr-ref-selectiondag.ll +++ b/llvm/test/DebugInfo/X86/instr-ref-selectiondag.ll @@ -281,15 +281,19 @@ lala: !11 = !{!13} !13 = !DILocalVariable(name: "baz", scope: !7, file: !1, line: 6, type: !10) !14 = !DILocation(line: 1, scope: !7) -!20 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11) +!20 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !23) !21 = !DILocalVariable(name: "xyzzy", scope: !20, file: !1, line: 6, type: !10) !22 = !DILocation(line: 1, scope: !20) -!30 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11) +!23 = !{!21} +!30 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !33) !31 = !DILocalVariable(name: "xyzzy", scope: !30, file: !1, line: 6, type: !10) !32 = !DILocation(line: 1, scope: !30) -!40 = distinct !DISubprogram(name: "qux", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11) +!33 = !{!31} +!40 = distinct !DISubprogram(name: "qux", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !46) !41 = !DILocalVariable(name: "socks", scope: !40, file: !1, line: 6, type: !10) !42 = !DILocation(line: 1, scope: !40) -!43 = distinct !DISubprogram(name: "inlined", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11) +!43 = distinct !DISubprogram(name: "inlined", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !47) !44 = !DILocation(line: 0, scope: !43, inlinedAt: !42) !45 = !DILocalVariable(name: "knees", scope: !43, file: !1, line: 6, type: !10) +!46 = !{!41} +!47 = !{!45} diff --git a/llvm/test/DebugInfo/X86/live-debug-values-constprop.mir b/llvm/test/DebugInfo/X86/live-debug-values-constprop.mir index 8a0537658c9c0..2900f0bdcf864 100644 --- a/llvm/test/DebugInfo/X86/live-debug-values-constprop.mir +++ b/llvm/test/DebugInfo/X86/live-debug-values-constprop.mir @@ -82,15 +82,18 @@ !14 = !DISubroutineType(types: !15) !15 = !{!16} !16 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) - !40 = distinct !DISubprogram(name: "bar", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !13, type: !14, isDefinition: true) + !40 = distinct !DISubprogram(name: "bar", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !43, type: !14, isDefinition: true) !41 = !DILocalVariable(name: "towel", scope: !40, file: !2, line: 1, type: !16) !42 = !DILocation(line: 40, scope: !40) - !80 = distinct !DISubprogram(name: "baz", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !13, type: !14, isDefinition: true) + !43 = !{!41} + !80 = distinct !DISubprogram(name: "baz", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !83, type: !14, isDefinition: true) !81 = !DILocalVariable(name: "socks", scope: !80, file: !2, line: 1, type: !16) !82 = !DILocation(line: 40, scope: !80) - !120 = distinct !DISubprogram(name: "qux", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !13, type: !14, isDefinition: true) + !83 = !{!81} + !120 = distinct !DISubprogram(name: "qux", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !123, type: !14, isDefinition: true) !121 = !DILocalVariable(name: "shoes", scope: !120, file: !2, line: 1, type: !16) !122 = !DILocation(line: 40, scope: !120) + !123 = !{!121} ... --- diff --git a/llvm/test/DebugInfo/X86/live-debug-values-remove-range.ll b/llvm/test/DebugInfo/X86/live-debug-values-remove-range.ll index e656c6237c068..145b5045687cf 100644 --- a/llvm/test/DebugInfo/X86/live-debug-values-remove-range.ll +++ b/llvm/test/DebugInfo/X86/live-debug-values-remove-range.ll @@ -108,6 +108,6 @@ exit: !106 = !DILocation(line: 1, scope: !104) !113 = !{!103} !203 = !DILocalVariable(name: "teacake", scope: !204, file: !2, line: 1, type: !16) -!204 = distinct !DISubprogram(name: "toad", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !113, type: !14, isDefinition: true) +!204 = distinct !DISubprogram(name: "toad", scope: !2, file: !2, line: 1, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !213, type: !14, isDefinition: true) !206 = !DILocation(line: 1, scope: !204) !213 = !{!203} diff --git a/llvm/test/DebugInfo/X86/live-debug-vars-intervals.mir b/llvm/test/DebugInfo/X86/live-debug-vars-intervals.mir index 3beaf8996e4f0..ab57a9612702f 100644 --- a/llvm/test/DebugInfo/X86/live-debug-vars-intervals.mir +++ b/llvm/test/DebugInfo/X86/live-debug-vars-intervals.mir @@ -91,7 +91,7 @@ !10 = !{!11} !11 = !DILocalVariable(name: "x", arg: 1, scope: !6, file: !1, line: 3, type: !9) !12 = !DILocation(line: 3, column: 12, scope: !6) - !13 = distinct !DISubprogram(name: "f2", scope: !1, file: !1, line: 20, type: !7, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !10) + !13 = distinct !DISubprogram(name: "f2", scope: !1, file: !1, line: 20, type: !7, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !{!14}) !14 = !DILocalVariable(name: "x", arg: 1, scope: !13, file: !1, line: 21, type: !9) !15 = !DILocation(line: 23, column: 12, scope: !13) diff --git a/llvm/test/Linker/thinlto_funcimport_debug.ll b/llvm/test/Linker/thinlto_funcimport_debug.ll index 294b3a773ef51..4454a56c40ef7 100644 --- a/llvm/test/Linker/thinlto_funcimport_debug.ll +++ b/llvm/test/Linker/thinlto_funcimport_debug.ll @@ -80,8 +80,8 @@ attributes #1 = { nounwind readnone } !26 = !DILocation(line: 9, column: 3, scope: !4) !27 = distinct !DISubprogram(name: "func3", scope: !1, file: !1, line: 8, type: !5, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !28) !28 = !{!29} -!29 = !DILocalVariable(name: "n", arg: 1, scope: !30, file: !1, line: 8, type: !7) +!29 = !DILocalVariable(name: "n", arg: 1, scope: !27, file: !1, line: 8, type: !33) !30 = distinct !DISubprogram(name: "func4", scope: !1, file: !1, line: 8, type: !5, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !31) !31 = !{!32} !32 = !DILocalVariable(name: "n", arg: 1, scope: !30, file: !1, line: 8, type: !7) - +!33 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", scope: !30, file: !1, line: 13, baseType: !7) diff --git a/llvm/test/Transforms/CodeExtractor/PartialInlineDebug.ll b/llvm/test/Transforms/CodeExtractor/PartialInlineDebug.ll index eb2fb4f4774d8..ab01bbf20de71 100644 --- a/llvm/test/Transforms/CodeExtractor/PartialInlineDebug.ll +++ b/llvm/test/Transforms/CodeExtractor/PartialInlineDebug.ll @@ -96,11 +96,11 @@ entry: !13 = !DILocalVariable(name: "v", arg: 1, scope: !8, file: !1, line: 3, type: !11) !14 = !DILocation(line: 5, column: 10, scope: !8) !15 = distinct !DILexicalBlock(scope: !16, file: !1, line: 9, column: 7) -!16 = distinct !DISubprogram(name: "callee", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!16 = distinct !DISubprogram(name: "callee", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !17 = !DILocation(line: 10, column: 7, scope: !15) -!18 = distinct !DISubprogram(name: "callee2", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!18 = distinct !DISubprogram(name: "callee2", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !19 = distinct !DILexicalBlock(scope: !18, file: !1, line: 100, column: 1) !20 = !DILocation(line: 110, column: 17, scope: !19) -!21 = distinct !DISubprogram(name: "caller2", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!21 = distinct !DISubprogram(name: "caller2", scope: !1, file: !1, line: 8, type: !9, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !22 = !DILocation(line: 110, column: 17, scope: !21) !23 = !DILocation(line: 15, column: 7, scope: !15) diff --git a/llvm/test/Transforms/HotColdSplit/split-out-dbg-label.ll b/llvm/test/Transforms/HotColdSplit/split-out-dbg-label.ll index da6c19d604c7c..76406ddea6b9f 100644 --- a/llvm/test/Transforms/HotColdSplit/split-out-dbg-label.ll +++ b/llvm/test/Transforms/HotColdSplit/split-out-dbg-label.ll @@ -66,7 +66,7 @@ define void @inline_me() !dbg !13 { !10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned) !11 = !DILocation(line: 1, column: 1, scope: !6) !12 = !DILabel(scope: !6, name: "bye", file: !1, line: 28) -!13 = distinct !DISubprogram(name: "inline_me", linkageName: "inline_me", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8) +!13 = distinct !DISubprogram(name: "inline_me", linkageName: "inline_me", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !2) !14 = !DILabel(scope: !13, name: "label_in_@inline_me", file: !1, line: 29) !15 = !DILocation(line: 2, column: 2, scope: !13, inlinedAt: !11) !16 = !DILabel(scope: !17, name: "scoped_label_in_foo", file: !1, line: 30) diff --git a/llvm/test/Transforms/HotColdSplit/transfer-debug-info.ll b/llvm/test/Transforms/HotColdSplit/transfer-debug-info.ll index 3f69f0c200dad..f9dd9eaf01422 100644 --- a/llvm/test/Transforms/HotColdSplit/transfer-debug-info.ll +++ b/llvm/test/Transforms/HotColdSplit/transfer-debug-info.ll @@ -106,7 +106,7 @@ define void @inline_me() !dbg !12{ !9 = !DILocalVariable(name: "1", scope: !6, file: !1, line: 1, type: !10) !10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned) !11 = !DILocation(line: 1, column: 1, scope: !6) -!12 = distinct !DISubprogram(name: "inline_me", linkageName: "inline_me", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8) +!12 = distinct !DISubprogram(name: "inline_me", linkageName: "inline_me", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !2) !13 = !DILocation(line: 2, column: 2, scope: !12, inlinedAt: !14) !14 = !DILocation(line: 3, column: 3, scope: !15) !15 = distinct !DILexicalBlock(scope: !16, file: !1, line: 4, column: 4) diff --git a/llvm/test/Transforms/InstCombine/debuginfo-dce.ll b/llvm/test/Transforms/InstCombine/debuginfo-dce.ll index c1d7c30e936f2..ec90779d0acce 100644 --- a/llvm/test/Transforms/InstCombine/debuginfo-dce.ll +++ b/llvm/test/Transforms/InstCombine/debuginfo-dce.ll @@ -125,15 +125,15 @@ attributes #1 = { nounwind readnone } !19 = !DILocation(line: 6, column: 17, scope: !14) !20 = !DIExpression(DW_OP_plus_uconst, 0) !21 = !DILocation(line: 11, column: 1, scope: !14) -!22 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17) +!22 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !23 = !DILocation(line: 6, column: 17, scope: !22) !24 = !DILocalVariable(name: "entry", scope: !22, file: !1, line: 6, type: !4) -!25 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17) +!25 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !26 = !DILocation(line: 6, column: 17, scope: !25) !27 = !DILocalVariable(name: "entry", scope: !25, file: !1, line: 6, type: !4) -!28 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17) +!28 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !29 = !DILocation(line: 6, column: 17, scope: !28) !30 = !DILocalVariable(name: "entry", scope: !28, file: !1, line: 6, type: !4) -!31 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17) +!31 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !32 = !DILocation(line: 6, column: 17, scope: !31) !33 = !DILocalVariable(name: "entry", scope: !31, file: !1, line: 6, type: !4) diff --git a/llvm/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll b/llvm/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll index 437e56665d53b..fa8357505e7e9 100644 --- a/llvm/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll +++ b/llvm/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll @@ -131,7 +131,8 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !10 = !DILexicalBlockFile(scope: !6, file: !1, discriminator: 0) !11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) !12 = !DILocation(line: 0, scope: !10) -!13 = distinct !DISubprogram(name: "multi_exit", scope: !1, file: !1, line: 10, type: !7, scopeLine: 10, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) +!13 = distinct !DISubprogram(name: "multi_exit", scope: !1, file: !1, line: 10, type: !7, scopeLine: 10, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17) !14 = !DILocation(line: 0, scope: !15) !15 = !DILexicalBlockFile(scope: !13, file: !1, discriminator: 0) !16 = !DILocalVariable(name: "sum2", scope: !15, file: !1, line: 11, type: !11) +!17 = !{!16} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index f16351720b20f..8d878f47d1ece 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -613,63 +613,17 @@ exit: define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) { ; COMMON-LABEL: define void @low_trip_count_fold_tail_scalarized_store( ; COMMON-SAME: ptr [[DST:%.*]]) { -; COMMON-NEXT: [[ENTRY:.*:]] -; COMMON-NEXT: br label %[[VECTOR_PH:.*]] -; COMMON: [[VECTOR_PH]]: -; COMMON-NEXT: br label %[[VECTOR_BODY:.*]] -; COMMON: [[VECTOR_BODY]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; COMMON: [[PRED_STORE_IF]]: -; COMMON-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 0 -; COMMON-NEXT: store i8 0, ptr [[TMP0]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE]] -; COMMON: [[PRED_STORE_CONTINUE]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] -; COMMON: [[PRED_STORE_IF1]]: -; COMMON-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i64 1 -; COMMON-NEXT: store i8 1, ptr [[TMP1]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE2]] -; COMMON: [[PRED_STORE_CONTINUE2]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] -; COMMON: [[PRED_STORE_IF3]]: -; COMMON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 2 -; COMMON-NEXT: store i8 2, ptr [[TMP2]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE4]] -; COMMON: [[PRED_STORE_CONTINUE4]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] -; COMMON: [[PRED_STORE_IF5]]: -; COMMON-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 3 -; COMMON-NEXT: store i8 3, ptr [[TMP3]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE6]] -; COMMON: [[PRED_STORE_CONTINUE6]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] -; COMMON: [[PRED_STORE_IF7]]: -; COMMON-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 4 -; COMMON-NEXT: store i8 4, ptr [[TMP4]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE8]] -; COMMON: [[PRED_STORE_CONTINUE8]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] -; COMMON: [[PRED_STORE_IF9]]: -; COMMON-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 5 -; COMMON-NEXT: store i8 5, ptr [[TMP5]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE10]] -; COMMON: [[PRED_STORE_CONTINUE10]]: -; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] -; COMMON: [[PRED_STORE_IF11]]: -; COMMON-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 6 -; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1 -; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]] -; COMMON: [[PRED_STORE_CONTINUE12]]: -; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]] -; COMMON: [[PRED_STORE_IF13]]: -; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7 -; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1 -; COMMON-NEXT: br label %[[EXIT]] +; COMMON-NEXT: [[ENTRY:.*]]: +; COMMON-NEXT: br label %[[LOOP:.*]] +; COMMON: [[LOOP]]: +; COMMON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; COMMON-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i8 +; COMMON-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]] +; COMMON-NEXT: store i8 [[IV_TRUNC]], ptr [[GEP]], align 1 +; COMMON-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; COMMON-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 7 +; COMMON-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] ; COMMON: [[EXIT]]: -; COMMON-NEXT: br label %[[SCALAR_PH:.*]] -; COMMON: [[SCALAR_PH]]: -; COMMON-NEXT: br label %[[EXIT1:.*]] -; COMMON: [[EXIT1]]: ; COMMON-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll index cfc6cc87a2a21..4b097ba2422e4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll @@ -271,69 +271,11 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 { ; ; PRED-LABEL: define void @iv_trunc( ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; PRED-NEXT: [[ENTRY:.*:]] +; PRED-NEXT: [[ENTRY:.*]]: ; PRED-NEXT: [[MUL_X:%.*]] = add i32 [[X]], 1 -; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; PRED-NEXT: br label %[[VECTOR_SCEVCHECK:.*]] -; PRED: [[VECTOR_SCEVCHECK]]: -; PRED-NEXT: [[TMP1:%.*]] = sub i32 -1, [[X]] -; PRED-NEXT: [[TMP2:%.*]] = icmp slt i32 [[MUL_X]], 0 -; PRED-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[MUL_X]] -; PRED-NEXT: [[TMP4:%.*]] = trunc i64 [[N]] to i32 -; PRED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]]) -; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 -; PRED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 -; PRED-NEXT: [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]] -; PRED-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0 -; PRED-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false -; PRED-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] -; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295 -; PRED-NEXT: [[TMP10:%.*]] = icmp ne i32 [[MUL_X]], 0 -; PRED-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]] -; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]] -; PRED-NEXT: br i1 [[TMP12]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; PRED: [[VECTOR_PH]]: -; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 2 -; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 2 -; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[MUL_X]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; PRED-NEXT: br label %[[VECTOR_BODY:.*]] -; PRED: [[VECTOR_BODY]]: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE2]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE2]] ] -; PRED-NEXT: [[TMP16:%.*]] = mul <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; PRED-NEXT: [[TMP17:%.*]] = zext <2 x i32> [[TMP16]] to <2 x i64> -; PRED-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0 -; PRED-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; PRED: [[PRED_STORE_IF]]: -; PRED-NEXT: [[TMP19:%.*]] = extractelement <2 x i64> [[TMP17]], i32 0 -; PRED-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]] -; PRED-NEXT: store i32 1, ptr [[TMP20]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE]] -; PRED: [[PRED_STORE_CONTINUE]]: -; PRED-NEXT: [[TMP21:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; PRED-NEXT: br i1 [[TMP21]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]] -; PRED: [[PRED_STORE_IF1]]: -; PRED-NEXT: [[TMP22:%.*]] = extractelement <2 x i64> [[TMP17]], i32 1 -; PRED-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP22]] -; PRED-NEXT: store i32 1, ptr [[TMP23]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE2]] -; PRED: [[PRED_STORE_CONTINUE2]]: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP15]]) -; PRED-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 -; PRED-NEXT: [[TMP25:%.*]] = xor i1 [[TMP24]], true -; PRED-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) -; PRED-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br label %[[EXIT:.*]] -; PRED: [[SCALAR_PH]]: ; PRED-NEXT: br label %[[FOR_BODY:.*]] ; PRED: [[FOR_BODY]]: -; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; PRED-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[IV]] to i32 ; PRED-NEXT: [[ADD_I:%.*]] = mul i32 [[MUL_X]], [[TRUNC_IV]] ; PRED-NEXT: [[IV_MUL:%.*]] = zext i32 [[ADD_I]] to i64 @@ -341,7 +283,7 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: store i32 1, ptr [[GEP]], align 4 ; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; PRED-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[FOR_BODY]] ; PRED: [[EXIT]]: ; PRED-NEXT: ret void ; @@ -437,101 +379,21 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 { ; ; PRED-LABEL: define void @trunc_ivs_and_store( ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; PRED-NEXT: [[ENTRY:.*:]] -; PRED-NEXT: [[MUL:%.*]] = mul i32 [[X]], [[X]] -; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; PRED-NEXT: br label %[[VECTOR_SCEVCHECK:.*]] -; PRED: [[VECTOR_SCEVCHECK]]: +; PRED-NEXT: [[ENTRY:.*]]: ; PRED-NEXT: [[TMP1:%.*]] = mul i32 [[X]], [[X]] -; PRED-NEXT: [[TMP2:%.*]] = sub i32 0, [[TMP1]] -; PRED-NEXT: [[TMP3:%.*]] = icmp slt i32 [[MUL]], 0 -; PRED-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 [[MUL]] -; PRED-NEXT: [[TMP5:%.*]] = trunc i64 [[N]] to i32 -; PRED-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP4]], i32 [[TMP5]]) -; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0 -; PRED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1 -; PRED-NEXT: [[TMP6:%.*]] = sub i32 0, [[MUL_RESULT]] -; PRED-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], 0 -; PRED-NEXT: [[TMP8:%.*]] = select i1 [[TMP3]], i1 [[TMP7]], i1 false -; PRED-NEXT: [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]] -; PRED-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[N]], 4294967295 -; PRED-NEXT: [[TMP11:%.*]] = icmp ne i32 [[MUL]], 0 -; PRED-NEXT: [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]] -; PRED-NEXT: [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]] -; PRED-NEXT: br i1 [[TMP13]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; PRED: [[VECTOR_PH]]: -; PRED-NEXT: [[TMP14:%.*]] = sub i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; PRED-NEXT: br label %[[VECTOR_BODY:.*]] -; PRED: [[VECTOR_BODY]]: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE7]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE7]] ] -; PRED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 -; PRED-NEXT: [[TMP17:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; PRED-NEXT: [[TMP18:%.*]] = zext <4 x i32> [[TMP17]] to <4 x i64> -; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0 -; PRED-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; PRED: [[PRED_STORE_IF]]: -; PRED-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0 -; PRED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP20]] -; PRED-NEXT: [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 0 -; PRED-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE]] -; PRED: [[PRED_STORE_CONTINUE]]: -; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; PRED-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3:.*]] -; PRED: [[PRED_STORE_IF2]]: -; PRED-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1 -; PRED-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP24]] -; PRED-NEXT: [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 1 -; PRED-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE3]] -; PRED: [[PRED_STORE_CONTINUE3]]: -; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2 -; PRED-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]] -; PRED: [[PRED_STORE_IF4]]: -; PRED-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2 -; PRED-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP28]] -; PRED-NEXT: [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], 2 -; PRED-NEXT: store i32 [[TMP30]], ptr [[TMP29]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE5]] -; PRED: [[PRED_STORE_CONTINUE5]]: -; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3 -; PRED-NEXT: br i1 [[TMP31]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]] -; PRED: [[PRED_STORE_IF6]]: -; PRED-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3 -; PRED-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP32]] -; PRED-NEXT: [[TMP34:%.*]] = add i32 [[OFFSET_IDX]], 3 -; PRED-NEXT: store i32 [[TMP34]], ptr [[TMP33]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE7]] -; PRED: [[PRED_STORE_CONTINUE7]]: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]]) -; PRED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 -; PRED-NEXT: [[TMP36:%.*]] = xor i1 [[TMP35]], true -; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; PRED-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br label %[[EXIT:.*]] -; PRED: [[SCALAR_PH]]: ; PRED-NEXT: br label %[[LOOP:.*]] ; PRED: [[LOOP]]: -; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] -; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] +; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] ; PRED-NEXT: [[IV_1_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32 -; PRED-NEXT: [[IV_1_MUL:%.*]] = mul i32 [[MUL]], [[IV_1_TRUNC]] +; PRED-NEXT: [[IV_1_MUL:%.*]] = mul i32 [[TMP1]], [[IV_1_TRUNC]] ; PRED-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1 ; PRED-NEXT: [[MUL_EXT:%.*]] = zext i32 [[IV_1_MUL]] to i64 ; PRED-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[MUL_EXT]] ; PRED-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4 ; PRED-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1 ; PRED-NEXT: [[EXITCOND_3_NOT:%.*]] = icmp eq i64 [[IV_1]], [[N]] -; PRED-NEXT: br i1 [[EXITCOND_3_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; PRED-NEXT: br i1 [[EXITCOND_3_NOT]], label %[[EXIT:.*]], label %[[LOOP]] ; PRED: [[EXIT]]: ; PRED-NEXT: ret void ; @@ -627,91 +489,12 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; ; PRED-LABEL: define void @ivs_trunc_and_ext( ; PRED-SAME: i32 [[X:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; PRED-NEXT: [[ENTRY:.*:]] +; PRED-NEXT: [[ENTRY:.*]]: ; PRED-NEXT: [[ADD:%.*]] = add i32 [[X]], 1 -; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; PRED-NEXT: br label %[[VECTOR_SCEVCHECK:.*]] -; PRED: [[VECTOR_SCEVCHECK]]: -; PRED-NEXT: [[TMP1:%.*]] = sub i32 -1, [[X]] -; PRED-NEXT: [[TMP2:%.*]] = icmp slt i32 [[ADD]], 0 -; PRED-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 [[TMP1]], i32 [[ADD]] -; PRED-NEXT: [[TMP4:%.*]] = trunc i64 [[N]] to i32 -; PRED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[TMP3]], i32 [[TMP4]]) -; PRED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 -; PRED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 -; PRED-NEXT: [[TMP5:%.*]] = sub i32 0, [[MUL_RESULT]] -; PRED-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], 0 -; PRED-NEXT: [[TMP7:%.*]] = select i1 [[TMP2]], i1 [[TMP6]], i1 false -; PRED-NEXT: [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW]] -; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], 4294967295 -; PRED-NEXT: [[TMP10:%.*]] = icmp ne i32 [[ADD]], 0 -; PRED-NEXT: [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP10]] -; PRED-NEXT: [[TMP12:%.*]] = or i1 [[TMP8]], [[TMP11]] -; PRED-NEXT: br i1 [[TMP12]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; PRED: [[VECTOR_PH]]: -; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 4 -; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[ADD]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; PRED-NEXT: br label %[[VECTOR_BODY:.*]] -; PRED: [[VECTOR_BODY]]: -; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ] -; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ] -; PRED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32 -; PRED-NEXT: [[TMP16:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]] -; PRED-NEXT: [[TMP17:%.*]] = zext <4 x i32> [[TMP16]] to <4 x i64> -; PRED-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0 -; PRED-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; PRED: [[PRED_STORE_IF]]: -; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0 -; PRED-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP19]] -; PRED-NEXT: [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 0 -; PRED-NEXT: store i32 [[TMP21]], ptr [[TMP20]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE]] -; PRED: [[PRED_STORE_CONTINUE]]: -; PRED-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; PRED-NEXT: br i1 [[TMP22]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] -; PRED: [[PRED_STORE_IF1]]: -; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1 -; PRED-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP23]] -; PRED-NEXT: [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 1 -; PRED-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE2]] -; PRED: [[PRED_STORE_CONTINUE2]]: -; PRED-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2 -; PRED-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] -; PRED: [[PRED_STORE_IF3]]: -; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2 -; PRED-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP27]] -; PRED-NEXT: [[TMP29:%.*]] = add i32 [[OFFSET_IDX]], 2 -; PRED-NEXT: store i32 [[TMP29]], ptr [[TMP28]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE4]] -; PRED: [[PRED_STORE_CONTINUE4]]: -; PRED-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3 -; PRED-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6]] -; PRED: [[PRED_STORE_IF5]]: -; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3 -; PRED-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP31]] -; PRED-NEXT: [[TMP33:%.*]] = add i32 [[OFFSET_IDX]], 3 -; PRED-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4 -; PRED-NEXT: br label %[[PRED_STORE_CONTINUE6]] -; PRED: [[PRED_STORE_CONTINUE6]]: -; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP15]]) -; PRED-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 -; PRED-NEXT: [[TMP35:%.*]] = xor i1 [[TMP34]], true -; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; PRED-NEXT: br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br label %[[EXIT:.*]] -; PRED: [[SCALAR_PH]]: ; PRED-NEXT: br label %[[LOOP:.*]] ; PRED: [[LOOP]]: -; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] -; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] +; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; PRED-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] ; PRED-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV_1]] to i32 ; PRED-NEXT: [[IV_MUL:%.*]] = mul i32 [[ADD]], [[IV_TRUNC]] ; PRED-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1 @@ -720,7 +503,7 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: store i32 [[IV_2]], ptr [[GEP]], align 4 ; PRED-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1 ; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_1]], [[N]] -; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; PRED-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] ; PRED: [[EXIT]]: ; PRED-NEXT: ret void ; @@ -842,7 +625,7 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) { ; PRED: [[PRED_STORE_CONTINUE5]]: ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 ; PRED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; PRED-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; PRED-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: @@ -855,7 +638,7 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) { ; PRED-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1 ; PRED-NEXT: [[IV_EXT]] = zext i32 [[IV_1_NEXT]] to i64 ; PRED-NEXT: [[C:%.*]] = icmp ult i64 [[IV_EXT]], [[N]] -; PRED-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP11:![0-9]+]] +; PRED-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP5:![0-9]+]] ; PRED: [[EXIT]]: ; PRED-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll index 6ea075f76aed4..83be0708774f1 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll @@ -181,178 +181,23 @@ for.cond.cleanup: define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) { ; DEFAULT-LABEL: define void @tail_predicate_without_optsize( ; DEFAULT-SAME: ptr [[P:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[C:%.*]], i32 [[N:%.*]]) { -; DEFAULT-NEXT: [[ENTRY:.*:]] -; DEFAULT-NEXT: br label %[[VECTOR_PH:.*]] -; DEFAULT: [[VECTOR_PH]]: -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i8> poison, i8 [[B]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT3]], <16 x i8> poison, <16 x i32> zeroinitializer -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT5]], <16 x i8> poison, <16 x i32> zeroinitializer -; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] -; DEFAULT: [[VECTOR_BODY]]: -; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE35:.*]] ] -; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <16 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE35]] ] -; DEFAULT-NEXT: [[VEC_IND1:%.*]] = phi <16 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[PRED_STORE_CONTINUE35]] ] -; DEFAULT-NEXT: [[TMP0:%.*]] = icmp ule <16 x i8> [[VEC_IND]], splat (i8 14) -; DEFAULT-NEXT: [[TMP1:%.*]] = mul <16 x i8> [[BROADCAST_SPLAT]], [[VEC_IND1]] -; DEFAULT-NEXT: [[TMP2:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 1) -; DEFAULT-NEXT: [[TMP3:%.*]] = mul <16 x i8> [[TMP2]], [[BROADCAST_SPLAT4]] -; DEFAULT-NEXT: [[TMP4:%.*]] = add <16 x i8> [[TMP3]], [[TMP1]] -; DEFAULT-NEXT: [[TMP5:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 2) -; DEFAULT-NEXT: [[TMP6:%.*]] = mul <16 x i8> [[TMP5]], [[BROADCAST_SPLAT6]] -; DEFAULT-NEXT: [[TMP7:%.*]] = add <16 x i8> [[TMP4]], [[TMP6]] -; DEFAULT-NEXT: [[TMP8:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0 -; DEFAULT-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; DEFAULT: [[PRED_STORE_IF]]: -; DEFAULT-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 -; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP9]] -; DEFAULT-NEXT: [[TMP11:%.*]] = extractelement <16 x i8> [[TMP7]], i32 0 -; DEFAULT-NEXT: store i8 [[TMP11]], ptr [[TMP10]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE]] -; DEFAULT: [[PRED_STORE_CONTINUE]]: -; DEFAULT-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1 -; DEFAULT-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]] -; DEFAULT: [[PRED_STORE_IF6]]: -; DEFAULT-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 1 -; DEFAULT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP13]] -; DEFAULT-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[TMP7]], i32 1 -; DEFAULT-NEXT: store i8 [[TMP15]], ptr [[TMP14]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE7]] -; DEFAULT: [[PRED_STORE_CONTINUE7]]: -; DEFAULT-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2 -; DEFAULT-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]] -; DEFAULT: [[PRED_STORE_IF8]]: -; DEFAULT-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2 -; DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP17]] -; DEFAULT-NEXT: [[TMP19:%.*]] = extractelement <16 x i8> [[TMP7]], i32 2 -; DEFAULT-NEXT: store i8 [[TMP19]], ptr [[TMP18]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE9]] -; DEFAULT: [[PRED_STORE_CONTINUE9]]: -; DEFAULT-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3 -; DEFAULT-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11:.*]] -; DEFAULT: [[PRED_STORE_IF10]]: -; DEFAULT-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 3 -; DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP21]] -; DEFAULT-NEXT: [[TMP23:%.*]] = extractelement <16 x i8> [[TMP7]], i32 3 -; DEFAULT-NEXT: store i8 [[TMP23]], ptr [[TMP22]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE11]] -; DEFAULT: [[PRED_STORE_CONTINUE11]]: -; DEFAULT-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4 -; DEFAULT-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13:.*]] -; DEFAULT: [[PRED_STORE_IF12]]: -; DEFAULT-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], 4 -; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP25]] -; DEFAULT-NEXT: [[TMP27:%.*]] = extractelement <16 x i8> [[TMP7]], i32 4 -; DEFAULT-NEXT: store i8 [[TMP27]], ptr [[TMP26]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE13]] -; DEFAULT: [[PRED_STORE_CONTINUE13]]: -; DEFAULT-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5 -; DEFAULT-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF14:.*]], label %[[PRED_STORE_CONTINUE15:.*]] -; DEFAULT: [[PRED_STORE_IF14]]: -; DEFAULT-NEXT: [[TMP29:%.*]] = add i64 [[INDEX]], 5 -; DEFAULT-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP29]] -; DEFAULT-NEXT: [[TMP31:%.*]] = extractelement <16 x i8> [[TMP7]], i32 5 -; DEFAULT-NEXT: store i8 [[TMP31]], ptr [[TMP30]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE15]] -; DEFAULT: [[PRED_STORE_CONTINUE15]]: -; DEFAULT-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6 -; DEFAULT-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF16:.*]], label %[[PRED_STORE_CONTINUE17:.*]] -; DEFAULT: [[PRED_STORE_IF16]]: -; DEFAULT-NEXT: [[TMP33:%.*]] = add i64 [[INDEX]], 6 -; DEFAULT-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP33]] -; DEFAULT-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[TMP7]], i32 6 -; DEFAULT-NEXT: store i8 [[TMP35]], ptr [[TMP34]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE17]] -; DEFAULT: [[PRED_STORE_CONTINUE17]]: -; DEFAULT-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7 -; DEFAULT-NEXT: br i1 [[TMP36]], label %[[PRED_STORE_IF18:.*]], label %[[PRED_STORE_CONTINUE19:.*]] -; DEFAULT: [[PRED_STORE_IF18]]: -; DEFAULT-NEXT: [[TMP37:%.*]] = add i64 [[INDEX]], 7 -; DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP37]] -; DEFAULT-NEXT: [[TMP39:%.*]] = extractelement <16 x i8> [[TMP7]], i32 7 -; DEFAULT-NEXT: store i8 [[TMP39]], ptr [[TMP38]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE19]] -; DEFAULT: [[PRED_STORE_CONTINUE19]]: -; DEFAULT-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8 -; DEFAULT-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF20:.*]], label %[[PRED_STORE_CONTINUE21:.*]] -; DEFAULT: [[PRED_STORE_IF20]]: -; DEFAULT-NEXT: [[TMP41:%.*]] = add i64 [[INDEX]], 8 -; DEFAULT-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP41]] -; DEFAULT-NEXT: [[TMP43:%.*]] = extractelement <16 x i8> [[TMP7]], i32 8 -; DEFAULT-NEXT: store i8 [[TMP43]], ptr [[TMP42]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE21]] -; DEFAULT: [[PRED_STORE_CONTINUE21]]: -; DEFAULT-NEXT: [[TMP44:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9 -; DEFAULT-NEXT: br i1 [[TMP44]], label %[[PRED_STORE_IF22:.*]], label %[[PRED_STORE_CONTINUE23:.*]] -; DEFAULT: [[PRED_STORE_IF22]]: -; DEFAULT-NEXT: [[TMP45:%.*]] = add i64 [[INDEX]], 9 -; DEFAULT-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP45]] -; DEFAULT-NEXT: [[TMP47:%.*]] = extractelement <16 x i8> [[TMP7]], i32 9 -; DEFAULT-NEXT: store i8 [[TMP47]], ptr [[TMP46]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE23]] -; DEFAULT: [[PRED_STORE_CONTINUE23]]: -; DEFAULT-NEXT: [[TMP48:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10 -; DEFAULT-NEXT: br i1 [[TMP48]], label %[[PRED_STORE_IF24:.*]], label %[[PRED_STORE_CONTINUE25:.*]] -; DEFAULT: [[PRED_STORE_IF24]]: -; DEFAULT-NEXT: [[TMP49:%.*]] = add i64 [[INDEX]], 10 -; DEFAULT-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP49]] -; DEFAULT-NEXT: [[TMP51:%.*]] = extractelement <16 x i8> [[TMP7]], i32 10 -; DEFAULT-NEXT: store i8 [[TMP51]], ptr [[TMP50]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE25]] -; DEFAULT: [[PRED_STORE_CONTINUE25]]: -; DEFAULT-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11 -; DEFAULT-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF26:.*]], label %[[PRED_STORE_CONTINUE27:.*]] -; DEFAULT: [[PRED_STORE_IF26]]: -; DEFAULT-NEXT: [[TMP53:%.*]] = add i64 [[INDEX]], 11 -; DEFAULT-NEXT: [[TMP54:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP53]] -; DEFAULT-NEXT: [[TMP55:%.*]] = extractelement <16 x i8> [[TMP7]], i32 11 -; DEFAULT-NEXT: store i8 [[TMP55]], ptr [[TMP54]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE27]] -; DEFAULT: [[PRED_STORE_CONTINUE27]]: -; DEFAULT-NEXT: [[TMP56:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12 -; DEFAULT-NEXT: br i1 [[TMP56]], label %[[PRED_STORE_IF28:.*]], label %[[PRED_STORE_CONTINUE29:.*]] -; DEFAULT: [[PRED_STORE_IF28]]: -; DEFAULT-NEXT: [[TMP57:%.*]] = add i64 [[INDEX]], 12 -; DEFAULT-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP57]] -; DEFAULT-NEXT: [[TMP59:%.*]] = extractelement <16 x i8> [[TMP7]], i32 12 -; DEFAULT-NEXT: store i8 [[TMP59]], ptr [[TMP58]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE29]] -; DEFAULT: [[PRED_STORE_CONTINUE29]]: -; DEFAULT-NEXT: [[TMP60:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13 -; DEFAULT-NEXT: br i1 [[TMP60]], label %[[PRED_STORE_IF30:.*]], label %[[PRED_STORE_CONTINUE31:.*]] -; DEFAULT: [[PRED_STORE_IF30]]: -; DEFAULT-NEXT: [[TMP61:%.*]] = add i64 [[INDEX]], 13 -; DEFAULT-NEXT: [[TMP62:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP61]] -; DEFAULT-NEXT: [[TMP63:%.*]] = extractelement <16 x i8> [[TMP7]], i32 13 -; DEFAULT-NEXT: store i8 [[TMP63]], ptr [[TMP62]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE31]] -; DEFAULT: [[PRED_STORE_CONTINUE31]]: -; DEFAULT-NEXT: [[TMP64:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14 -; DEFAULT-NEXT: br i1 [[TMP64]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]] -; DEFAULT: [[PRED_STORE_IF32]]: -; DEFAULT-NEXT: [[TMP65:%.*]] = add i64 [[INDEX]], 14 -; DEFAULT-NEXT: [[TMP66:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP65]] -; DEFAULT-NEXT: [[TMP67:%.*]] = extractelement <16 x i8> [[TMP7]], i32 14 -; DEFAULT-NEXT: store i8 [[TMP67]], ptr [[TMP66]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE33]] -; DEFAULT: [[PRED_STORE_CONTINUE33]]: -; DEFAULT-NEXT: [[TMP68:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15 -; DEFAULT-NEXT: br i1 [[TMP68]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35]] -; DEFAULT: [[PRED_STORE_IF34]]: -; DEFAULT-NEXT: [[TMP69:%.*]] = add i64 [[INDEX]], 15 +; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] +; DEFAULT: [[FOR_BODY]]: +; DEFAULT-NEXT: [[TMP69:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; DEFAULT-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[TMP69]] to i8 +; DEFAULT-NEXT: [[MUL:%.*]] = mul i8 [[A]], [[TMP0]] +; DEFAULT-NEXT: [[SHR:%.*]] = lshr i8 [[TMP0]], 1 +; DEFAULT-NEXT: [[MUL5:%.*]] = mul i8 [[SHR]], [[B]] +; DEFAULT-NEXT: [[ADD:%.*]] = add i8 [[MUL5]], [[MUL]] +; DEFAULT-NEXT: [[SHR7:%.*]] = lshr i8 [[TMP0]], 2 +; DEFAULT-NEXT: [[MUL9:%.*]] = mul i8 [[SHR7]], [[C]] +; DEFAULT-NEXT: [[TMP71:%.*]] = add i8 [[ADD]], [[MUL9]] ; DEFAULT-NEXT: [[TMP70:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP69]] -; DEFAULT-NEXT: [[TMP71:%.*]] = extractelement <16 x i8> [[TMP7]], i32 15 ; DEFAULT-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1 -; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE35]] -; DEFAULT: [[PRED_STORE_CONTINUE35]]: -; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16) -; DEFAULT-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16) -; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; DEFAULT: [[MIDDLE_BLOCK]]: -; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] +; DEFAULT-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[TMP69]], 1 +; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 15 +; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY]] ; DEFAULT: [[FOR_COND_CLEANUP]]: ; DEFAULT-NEXT: ret void ; @@ -449,7 +294,7 @@ define void @dont_vectorize_with_minsize() { ; DEFAULT-NEXT: store <4 x i16> [[TMP11]], ptr [[TMP9]], align 2 ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4 ; DEFAULT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 -; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; DEFAULT: [[FOR_COND_CLEANUP]]: @@ -555,7 +400,7 @@ define void @vectorization_forced() { ; DEFAULT-NEXT: store <4 x i16> [[TMP11]], ptr [[TMP9]], align 2 ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4 ; DEFAULT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 -; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; DEFAULT: [[FOR_COND_CLEANUP]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll index a286df9bc2fc7..c2c04ce6f5ff5 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll @@ -85,13 +85,13 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 10 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 17 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 21 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 35 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 8 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 43 for VF 16 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 71 for VF 16 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 16 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ; ENABLED_MASKED_STRIDED-LABEL: 'test2' @@ -99,8 +99,8 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 13 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2 diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll index cc84fabd00ecc..002d811d46992 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll @@ -435,67 +435,16 @@ define void @test_first_order_recurrence_tried_to_scalarized(ptr %dst, i1 %c, i3 ; CHECK-LABEL: @test_first_order_recurrence_tried_to_scalarized( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[N:%.*]] = select i1 [[C:%.*]], i32 8, i32 9 -; CHECK-NEXT: br label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ] -; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND]], [[PRED_STORE_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[DST:%.*]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw i32 10, [[TMP5]] -; CHECK-NEXT: store i32 [[TMP6]], ptr [[TMP4]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 -; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] -; CHECK: pred.store.if1: -; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 1 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = sub nsw i32 10, [[TMP10]] -; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP9]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] -; CHECK: pred.store.continue2: -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 -; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] -; CHECK: pred.store.if3: -; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2 +; CHECK: loop: +; CHECK-NEXT: [[TMP18:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP15:%.*]] = phi i32 [ 4, [[ENTRY]] ], [ [[TMP18]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[TMP18]], 1 ; CHECK-NEXT: [[TMP16:%.*]] = sub nsw i32 10, [[TMP15]] -; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP14]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.continue4: -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 -; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] -; CHECK: pred.store.if5: -; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], 3 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP21:%.*]] = sub nsw i32 10, [[TMP20]] -; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP19]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] -; CHECK: pred.store.continue6: -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) -; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i32, ptr [[DST:%.*]], i32 [[TMP18]] +; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP19]], align 4 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[VECTOR_BODY]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/debugloc.ll b/llvm/test/Transforms/LoopVectorize/debugloc.ll index 40cd6b63ca8f6..03e0853d29075 100644 --- a/llvm/test/Transforms/LoopVectorize/debugloc.ll +++ b/llvm/test/Transforms/LoopVectorize/debugloc.ll @@ -253,10 +253,10 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !32 = distinct !DILexicalBlock(scope: !31, file: !5, line: 137, column: 2) !33 = !DILocation(line: 210, column: 44, scope: !32) !34 = !DILocation(line: 320, column: 44, scope: !32) -!35 = distinct !DISubprogram(name: "test_misc", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3, file: !5, scope: !6, type: !7, retainedNodes: !12) +!35 = distinct !DISubprogram(name: "test_misc", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3, file: !5, scope: !6, type: !7, retainedNodes: !2) !36 = distinct !DILexicalBlock(scope: !35, file: !5, line: 137, column: 2) !37 = !DILocation(line: 430, column: 44, scope: !36) !38 = !DILocation(line: 540, column: 44, scope: !36) -!39 = distinct !DISubprogram(name: "test_scalar_Steps", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3, file: !5, scope: !6, type: !7, retainedNodes: !12) +!39 = distinct !DISubprogram(name: "test_scalar_Steps", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3, file: !5, scope: !6, type: !7, retainedNodes: !2) !40 = distinct !DILexicalBlock(scope: !39, file: !5, line: 137, column: 2) !41 = !DILocation(line: 650, column: 44, scope: !40) diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index fe230fa6c9090..b72cbd333cb79 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -49,6 +49,8 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize ; CHECK-NEXT: loop.0: ; CHECK-NEXT: WIDEN-CAST ir<%conv> = sext vp<[[PRED1]]> to i32 ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%0>, ir<%conv> +; CHECK-NEXT: WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> +; CHECK-NEXT: WIDEN ir<%add> = add ir<%conv>, ir<%rem> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { @@ -57,9 +59,7 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[STEPS]]> -; CHECK-NEXT: REPLICATE ir<%add> = add ir<%conv>, ir<%rem> ; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep.dst> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: @@ -293,27 +293,44 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr ; CHECK-NEXT: loop.0: ; CHECK-NEXT: WIDEN-CAST ir<%conv> = sext vp<[[PRED]]> to i32 ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%0>, ir<%conv> -; CHECK-NEXT: Successor(s): pred.store +; CHECK-NEXT: WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> +; CHECK-NEXT: Successor(s): pred.load ; CHECK-EMPTY: -; CHECK: pred.store: { -; CHECK-NEXT: pred.store.entry: +; CHECK: pred.load: { +; CHECK-NEXT: pred.load.entry: ; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> -; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-NEXT: Successor(s): pred.load.if, pred.load.continue ; CHECK-EMPTY: -; CHECK: pred.store.if: -; CHECK-NEXT: REPLICATE ir<%lv.2> = load ir<%gep> -; CHECK-NEXT: REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> -; CHECK-NEXT: REPLICATE ir<%conv.lv.2> = sext ir<%lv.2> -; CHECK-NEXT: REPLICATE ir<%add.1> = add ir<%conv>, ir<%rem> -; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[STEPS]]> -; CHECK-NEXT: REPLICATE ir<%add> = add ir<%add.1>, ir<%conv.lv.2> -; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep.dst> -; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK: pred.load.if: +; CHECK-NEXT: REPLICATE ir<%lv.2> = load ir<%gep> (S->V) +; CHECK-NEXT: Successor(s): pred.load.continue ; CHECK-EMPTY: -; CHECK: pred.store.continue: +; CHECK: pred.load.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%9> = ir<%lv.2> ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): loop.2 +; CHECK-NEXT: Successor(s): loop.1 +; CHECK-EMPTY: +; CHECK-NEXT: loop.1: +; CHECK-NEXT: WIDEN ir<%add.1> = add ir<%conv>, ir<%rem> +; CHECK-NEXT: WIDEN-CAST ir<%conv.lv.2> = sext vp<%9> to i32 +; CHECK-NEXT: WIDEN ir<%add> = add ir<%add.1>, ir<%conv.lv.2> +; CHECK-NEXT: Successor(s): pred.store +; CHECK-EMPTY: +; CHECK-NEXT: pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[STEPS]]> +; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep.dst> +; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): loop.2 ; CHECK-EMPTY: ; CHECK: loop.2: ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> @@ -377,6 +394,7 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias ; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]> ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]> ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%recur>, ir<%recur.next> +; CHECK-NEXT: WIDEN ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { @@ -386,7 +404,6 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> -; CHECK-NEXT: REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: REPLICATE ir<%rem.div> = sdiv ir<20>, ir<%rem> ; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEPS]]> ; CHECK-NEXT: REPLICATE store ir<%rem.div>, ir<%gep> @@ -457,6 +474,7 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp ule vp<[[WIDE_IV]]>, vp<[[BTC]]> ; CHECK-NEXT: CLONE ir<[[L]]> = load ir<%src> ; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%.pn>, ir<[[L]]> +; CHECK-NEXT: WIDEN ir<%val> = sdiv vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { @@ -467,7 +485,6 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias ; CHECK-NEXT: pred.store.if: ; CHECK-NEXT: vp<[[SCALAR_STEPS:%.+]]> = SCALAR-STEPS vp<[[DERIVED_IV]]>, ir<1>, vp<[[VF]]> ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<[[SCALAR_STEPS]]> -; CHECK-NEXT: REPLICATE ir<%val> = sdiv vp<[[SPLICE]]>, ir<%x> ; CHECK-NEXT: REPLICATE store ir<%val>, ir<%gep.dst> ; CHECK-NEXT: Successor(s): pred.store.continue ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll index 2dd6a04ee7d4a..3161a0d5e6f5e 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -1,6 +1,6 @@ ; REQUIRES: asserts -; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -debug -disable-output %s 2>&1 | FileCheck %s +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -force-widen-divrem-via-safe-divisor=0 -debug -disable-output %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll index aaabd18958fae..618ec86ebd35d 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks-inlining.ll @@ -118,18 +118,18 @@ declare <2 x float> @llvm.matrix.transpose(<2 x float>, i32, i32) !4 = !{i32 2, !"Debug Info Version", i32 3} !5 = distinct !DISubprogram(name: "load_fn", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) !17 = !DIFile(filename: "toplevel.c", directory: "/test") -!16 = distinct !DISubprogram(name: "toplevel", scope: !1, file: !17, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!16 = distinct !DISubprogram(name: "toplevel", scope: !1, file: !17, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !18 = !DIFile(filename: "assign.h", directory: "/test") -!19 = distinct !DISubprogram(name: "assign", scope: !1, file: !18, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!19 = distinct !DISubprogram(name: "assign", scope: !1, file: !18, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !20 = !DIFile(filename: "add.h", directory: "/test") -!21 = distinct !DISubprogram(name: "add_fn", scope: !1, file: !20, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!21 = distinct !DISubprogram(name: "add_fn", scope: !1, file: !20, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !22 = !DIFile(filename: "store.h", directory: "/test") -!23 = distinct !DISubprogram(name: "store_fn", scope: !1, file: !22, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!23 = distinct !DISubprogram(name: "store_fn", scope: !1, file: !22, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !24 = !DIFile(filename: "transpose.h", directory: "/test") -!25 = distinct !DISubprogram(name: "transpose", scope: !1, file: !24, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!25 = distinct !DISubprogram(name: "transpose", scope: !1, file: !24, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !6 = !DISubroutineType(types: !7) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll index 628ff08b81679..ff41c57055bff 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/remarks.ll @@ -163,26 +163,26 @@ declare void @llvm.matrix.column.major.store(<9 x double>, ptr, i64, i1, i32, i3 !19 = !DILocation(line: 10, column: 20, scope: !5) !20 = !DILocation(line: 10, column: 10, scope: !5) -!21 = distinct !DISubprogram(name: "fn2", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!21 = distinct !DISubprogram(name: "fn2", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !22 = !DILocation(line: 30, column: 20, scope: !21) -!23 = distinct !DISubprogram(name: "fn3", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!23 = distinct !DISubprogram(name: "fn3", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !24 = !DILocation(line: 40, column: 20, scope: !23) -!25 = distinct !DISubprogram(name: "fn4", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!25 = distinct !DISubprogram(name: "fn4", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !26 = !DILocation(line: 50, column: 20, scope: !25) -!27 = distinct !DISubprogram(name: "fn5", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!27 = distinct !DISubprogram(name: "fn5", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !28 = !DILocation(line: 60, column: 20, scope: !27) -!29 = distinct !DISubprogram(name: "fn6", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!29 = distinct !DISubprogram(name: "fn6", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !30 = !DILocation(line: 70, column: 20, scope: !29) -!31 = distinct !DISubprogram(name: "fn7", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!31 = distinct !DISubprogram(name: "fn7", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !32 = !DILocation(line: 80, column: 20, scope: !31) -!33 = distinct !DISubprogram(name: "fn8", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!33 = distinct !DISubprogram(name: "fn8", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !34 = !DILocation(line: 90, column: 20, scope: !33) -!35 = distinct !DISubprogram(name: "fn9", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12) +!35 = distinct !DISubprogram(name: "fn9", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2) !36 = !DILocation(line: 100, column: 20, scope: !35) diff --git a/llvm/test/Transforms/SROA/heterogeneous-poison.ll b/llvm/test/Transforms/SROA/heterogeneous-poison.ll index b4d1e80833c11..63264d2aad969 100644 --- a/llvm/test/Transforms/SROA/heterogeneous-poison.ll +++ b/llvm/test/Transforms/SROA/heterogeneous-poison.ll @@ -7,9 +7,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80: %struct.pair = type { i32, i32 } define i32 @t1() !dbg !9 { -; CHECK-LABEL: define i32 @t1( -; CHECK-SAME: ) !dbg [[DBG9:![0-9]+]] { -; CHECK-NEXT: #dbg_value(i32 2, [[META13:![0-9]+]], !DIExpression(DIOpArg(0, i32)), [[META15:![0-9]+]]) +; CHECK-LABEL: define i32 @t1() { ; CHECK-NEXT: ret i32 2 ; %local = alloca i32, align 4 @@ -21,17 +19,14 @@ define i32 @t1() !dbg !9 { define i32 @t2(i1 %cond) !dbg !16 { ; CHECK-LABEL: define i32 @t2( -; CHECK-SAME: i1 [[COND:%.*]]) !dbg [[DBG16:![0-9]+]] { +; CHECK-SAME: i1 [[COND:%.*]]) { ; CHECK-NEXT: br i1 [[COND]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: -; CHECK-NEXT: #dbg_value(i32 42, [[META17:![0-9]+]], !DIExpression(DIOpArg(0, i32)), [[META18:![0-9]+]]) ; CHECK-NEXT: br label [[JOIN:%.*]] ; CHECK: else: -; CHECK-NEXT: #dbg_value(i32 2, [[META17]], !DIExpression(DIOpArg(0, i32)), [[META18]]) ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: ; CHECK-NEXT: [[LOCAL_0:%.*]] = phi i32 [ 42, [[THEN]] ], [ 2, [[ELSE]] ] -; CHECK-NEXT: #dbg_value(i32 [[LOCAL_0]], [[META17]], !DIExpression(DIOpArg(0, i32)), [[META18]]) ; CHECK-NEXT: ret i32 [[LOCAL_0]] ; %local = alloca i32, align 4 @@ -52,10 +47,7 @@ join: ; preds = %else, %then } define void @t3() !dbg !19 { -; CHECK-LABEL: define void @t3( -; CHECK-SAME: ) !dbg [[DBG19:![0-9]+]] { -; CHECK-NEXT: #dbg_value(i32 42, [[META20:![0-9]+]], !DIExpression(DIOpArg(0, i32), DIOpFragment(0, 32)), [[META25:![0-9]+]]) -; CHECK-NEXT: #dbg_value(i32 43, [[META20]], !DIExpression(DIOpArg(0, i32), DIOpFragment(32, 32)), [[META25]]) +; CHECK-LABEL: define void @t3() { ; CHECK-NEXT: ret void ; %local = alloca %struct.pair, align 4 @@ -70,9 +62,7 @@ define void @t3() !dbg !19 { define i32 @t4() !dbg !26 { ;; FIXME(diexpression-poison): We could probably preserve debug info for the dbg.value here if ;; necessary. Check that we at least do something sensible with it for now. -; CHECK-LABEL: define i32 @t4( -; CHECK-SAME: ) !dbg [[DBG26:![0-9]+]] { -; CHECK-NEXT: #dbg_value(ptr poison, [[META27:![0-9]+]], !DIExpression(DIOpArg(0, ptr addrspace(5)), DIOpDeref(i32)), [[META28:![0-9]+]]) +; CHECK-LABEL: define i32 @t4() { ; CHECK-NEXT: ret i32 42 ; %local = alloca i32, align 4 @@ -87,17 +77,14 @@ define i16 @t5(i1 %cond) !dbg !29 { ;; of the variable !30. This is something that old-style DIExpressions don't ;; support. ; CHECK-LABEL: define i16 @t5( -; CHECK-SAME: i1 [[COND:%.*]]) !dbg [[DBG29:![0-9]+]] { +; CHECK-SAME: i1 [[COND:%.*]]) { ; CHECK-NEXT: br i1 [[COND]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: -; CHECK-NEXT: #dbg_value(i16 42, [[META30:![0-9]+]], !DIExpression(DIOpArg(0, i16), DIOpSExt(i32)), [[META31:![0-9]+]]) ; CHECK-NEXT: br label [[JOIN:%.*]] ; CHECK: else: -; CHECK-NEXT: #dbg_value(i16 43, [[META30]], !DIExpression(DIOpArg(0, i16), DIOpSExt(i32)), [[META31]]) ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: ; CHECK-NEXT: [[LOCAL_0:%.*]] = phi i16 [ 42, [[THEN]] ], [ 43, [[ELSE]] ] -; CHECK-NEXT: #dbg_value(i16 [[LOCAL_0]], [[META30]], !DIExpression(DIOpArg(0, i16), DIOpSExt(i32)), [[META31]]) ; CHECK-NEXT: ret i16 [[LOCAL_0]] ; %local = alloca i16, align 4 @@ -120,12 +107,7 @@ join: ; preds = %else, %then %struct.pair.pair = type { %struct.pair, %struct.pair } define void @t6() !dbg !32 { -; CHECK-LABEL: define void @t6( -; CHECK-SAME: ) !dbg [[DBG32:![0-9]+]] { -; CHECK-NEXT: #dbg_value(i32 0, [[META33:![0-9]+]], !DIExpression(DIOpArg(0, i32), DIOpFragment(0, 32)), [[META38:![0-9]+]]) -; CHECK-NEXT: #dbg_value(i32 1, [[META33]], !DIExpression(DIOpArg(0, i32), DIOpFragment(32, 32)), [[META38]]) -; CHECK-NEXT: #dbg_value(i32 2, [[META33]], !DIExpression(DIOpArg(0, i32), DIOpFragment(64, 32)), [[META38]]) -; CHECK-NEXT: #dbg_value(i32 3, [[META33]], !DIExpression(DIOpArg(0, i32), DIOpFragment(96, 32)), [[META38]]) +; CHECK-LABEL: define void @t6() { ; CHECK-NEXT: ret void ; %first = alloca %struct.pair, align 4 @@ -195,37 +177,3 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo !37 = !DILocalVariable(name: "local", scope: !32, file: !1, line: 1, type: !33) !38 = !DILocation(line: 1, column: 1, scope: !32) -;. -; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META1:![0-9]+]], producer: "clang 19", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) -; CHECK: [[META1]] = !DIFile(filename: "{{.*}}t.cpp", directory: {{.*}}) -; CHECK: [[DBG9]] = distinct !DISubprogram(name: "t1", linkageName: "t1", scope: [[META1]], file: [[META1]], line: 7, type: [[META10:![0-9]+]], scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META12:![0-9]+]]) -; CHECK: [[META10]] = !DISubroutineType(types: [[META11:![0-9]+]]) -; CHECK: [[META11]] = !{null} -; CHECK: [[META12]] = !{[[META13]]} -; CHECK: [[META13]] = !DILocalVariable(name: "local", scope: [[DBG9]], file: [[META1]], line: 8, type: [[META14:![0-9]+]]) -; CHECK: [[META14]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -; CHECK: [[META15]] = !DILocation(line: 0, scope: [[DBG9]]) -; CHECK: [[DBG16]] = distinct !DISubprogram(name: "t2", linkageName: "t2", scope: [[META1]], file: [[META1]], line: 7, type: [[META10]], scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META12]]) -; CHECK: [[META17]] = !DILocalVariable(name: "local", scope: [[DBG16]], file: [[META1]], line: 1, type: [[META14]]) -; CHECK: [[META18]] = !DILocation(line: 0, scope: [[DBG16]]) -; CHECK: [[DBG19]] = distinct !DISubprogram(name: "t3", linkageName: "t3", scope: [[META1]], file: [[META1]], line: 7, type: [[META10]], scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META12]]) -; CHECK: [[META20]] = !DILocalVariable(name: "local", scope: [[DBG19]], file: [[META1]], line: 1, type: [[META21:![0-9]+]]) -; CHECK: [[META21]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "pair", file: [[META1]], line: 2, size: 64, flags: DIFlagTypePassByValue, elements: [[META22:![0-9]+]], identifier: "pair") -; CHECK: [[META22]] = !{[[META23:![0-9]+]], [[META24:![0-9]+]]} -; CHECK: [[META23]] = !DIDerivedType(tag: DW_TAG_member, name: "s1", scope: [[META21]], file: [[META1]], line: 3, baseType: [[META14]], size: 32) -; CHECK: [[META24]] = !DIDerivedType(tag: DW_TAG_member, name: "s2", scope: [[META21]], file: [[META1]], line: 4, baseType: [[META14]], size: 32, offset: 32) -; CHECK: [[META25]] = !DILocation(line: 0, scope: [[DBG19]]) -; CHECK: [[DBG26]] = distinct !DISubprogram(name: "t4", linkageName: "t4", scope: [[META1]], file: [[META1]], line: 7, type: [[META10]], scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META12]]) -; CHECK: [[META27]] = !DILocalVariable(name: "local", scope: [[DBG26]], file: [[META1]], line: 1, type: [[META14]]) -; CHECK: [[META28]] = !DILocation(line: 1, column: 1, scope: [[DBG26]]) -; CHECK: [[DBG29]] = distinct !DISubprogram(name: "t5", linkageName: "t5", scope: [[META1]], file: [[META1]], line: 7, type: [[META10]], scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META12]]) -; CHECK: [[META30]] = !DILocalVariable(name: "local_i16", scope: [[DBG29]], file: [[META1]], line: 1, type: [[META14]]) -; CHECK: [[META31]] = !DILocation(line: 0, scope: [[DBG29]]) -; CHECK: [[DBG32]] = distinct !DISubprogram(name: "t6", linkageName: "t56", scope: [[META1]], file: [[META1]], line: 7, type: [[META10]], scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META12]]) -; CHECK: [[META33]] = !DILocalVariable(name: "local", scope: [[DBG32]], file: [[META1]], line: 1, type: [[META34:![0-9]+]]) -; CHECK: [[META34]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "pair_pair", file: [[META1]], line: 2, size: 128, flags: DIFlagTypePassByValue, elements: [[META35:![0-9]+]], identifier: "pair_pair") -; CHECK: [[META35]] = !{[[META36:![0-9]+]], [[META37:![0-9]+]]} -; CHECK: [[META36]] = !DIDerivedType(tag: DW_TAG_member, name: "s1", scope: [[META34]], file: [[META1]], line: 3, baseType: [[META21]], size: 64) -; CHECK: [[META37]] = !DIDerivedType(tag: DW_TAG_member, name: "s2", scope: [[META34]], file: [[META1]], line: 4, baseType: [[META21]], size: 64, offset: 64) -; CHECK: [[META38]] = !DILocation(line: 0, scope: [[DBG32]]) -;. diff --git a/llvm/test/Transforms/Util/annotation-remarks-dbg-info.ll b/llvm/test/Transforms/Util/annotation-remarks-dbg-info.ll index a0fa79aa7edbe..7fc72077ee5b3 100644 --- a/llvm/test/Transforms/Util/annotation-remarks-dbg-info.ll +++ b/llvm/test/Transforms/Util/annotation-remarks-dbg-info.ll @@ -72,5 +72,5 @@ entry: !14 = !{!15} !15 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 1, type: !10) !16 = !DILocation(line: 400, column: 3, scope: !7) -!17 = distinct !DISubprogram(name: "test2", scope: !1, file: !1, line: 21, type: !8, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !14) +!17 = distinct !DISubprogram(name: "test2", scope: !1, file: !1, line: 21, type: !8, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !18 = !DILocation(line: 200, column: 3, scope: !17) diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp index 6dc647655d92f..77b2f491f6a72 100644 --- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp @@ -28,7 +28,6 @@ #include "llvm/TargetParser/Host.h" #include -#include #include #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && \ !defined(_M_ARM64EC) diff --git a/llvm/tools/llvm-gpu-loader/amdhsa.cpp b/llvm/tools/llvm-gpu-loader/amdhsa.cpp index 5715058d8cfac..fa9ee185549a5 100644 --- a/llvm/tools/llvm-gpu-loader/amdhsa.cpp +++ b/llvm/tools/llvm-gpu-loader/amdhsa.cpp @@ -26,7 +26,6 @@ #include #include #include -#include #include // The implicit arguments of COV5 AMDGPU kernels. diff --git a/llvm/tools/llvm-mc/Disassembler.h b/llvm/tools/llvm-mc/Disassembler.h index dd8525d73200e..76cee9e84c312 100644 --- a/llvm/tools/llvm-mc/Disassembler.h +++ b/llvm/tools/llvm-mc/Disassembler.h @@ -14,8 +14,6 @@ #ifndef LLVM_TOOLS_LLVM_MC_DISASSEMBLER_H #define LLVM_TOOLS_LLVM_MC_DISASSEMBLER_H -#include - namespace llvm { class MemoryBuffer; diff --git a/llvm/tools/llvm-xray/trie-node.h b/llvm/tools/llvm-xray/trie-node.h index b42b0293620dd..f96be592fc364 100644 --- a/llvm/tools/llvm-xray/trie-node.h +++ b/llvm/tools/llvm-xray/trie-node.h @@ -15,7 +15,6 @@ #define LLVM_TOOLS_LLVM_XRAY_STACK_TRIE_H #include -#include #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" diff --git a/llvm/unittests/ADT/STLForwardCompatTest.cpp b/llvm/unittests/ADT/STLForwardCompatTest.cpp index c6ae6e36cfbff..d0092fdb52b01 100644 --- a/llvm/unittests/ADT/STLForwardCompatTest.cpp +++ b/llvm/unittests/ADT/STLForwardCompatTest.cpp @@ -11,7 +11,6 @@ #include "gtest/gtest.h" #include -#include #include #include diff --git a/llvm/unittests/ADT/SequenceTest.cpp b/llvm/unittests/ADT/SequenceTest.cpp index 7aa39568888b2..ab50ad0bb5606 100644 --- a/llvm/unittests/ADT/SequenceTest.cpp +++ b/llvm/unittests/ADT/SequenceTest.cpp @@ -11,7 +11,9 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" -#include +#include +#include +#include using namespace llvm; diff --git a/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp b/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp index b40b61e7a4a6f..441344b281411 100644 --- a/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp @@ -23,7 +23,6 @@ #include "gtest/gtest.h" #include -#include #include #include diff --git a/llvm/unittests/IR/VFABIDemanglerTest.cpp b/llvm/unittests/IR/VFABIDemanglerTest.cpp index e30e0f865f719..7d946131d4cba 100644 --- a/llvm/unittests/IR/VFABIDemanglerTest.cpp +++ b/llvm/unittests/IR/VFABIDemanglerTest.cpp @@ -15,7 +15,6 @@ #include "llvm/IR/Module.h" #include "llvm/Support/SourceMgr.h" #include "gtest/gtest.h" -#include using namespace llvm; diff --git a/llvm/utils/TableGen/Common/CodeGenHwModes.h b/llvm/utils/TableGen/Common/CodeGenHwModes.h index 55062b6ebeb35..7e45f8ef1de49 100644 --- a/llvm/utils/TableGen/Common/CodeGenHwModes.h +++ b/llvm/utils/TableGen/Common/CodeGenHwModes.h @@ -15,7 +15,6 @@ #include "llvm/ADT/StringRef.h" #include #include -#include #include #include diff --git a/llvm/utils/TableGen/Common/CodeGenTarget.h b/llvm/utils/TableGen/Common/CodeGenTarget.h index b6f9d4209a13e..c1ed70d1739f0 100644 --- a/llvm/utils/TableGen/Common/CodeGenTarget.h +++ b/llvm/utils/TableGen/Common/CodeGenTarget.h @@ -28,7 +28,6 @@ #include "llvm/CodeGenTypes/MachineValueType.h" #include #include -#include #include #include diff --git a/llvm/utils/TableGen/TableGenBackends.h b/llvm/utils/TableGen/TableGenBackends.h index 6e13864d12e16..261e17172abee 100644 --- a/llvm/utils/TableGen/TableGenBackends.h +++ b/llvm/utils/TableGen/TableGenBackends.h @@ -15,8 +15,6 @@ #ifndef LLVM_UTILS_TABLEGEN_TABLEGENBACKENDS_H #define LLVM_UTILS_TABLEGEN_TABLEGENBACKENDS_H -#include - // A TableGen backend is a function that looks like // // EmitFoo(RecordKeeper &RK, raw_ostream &OS /*, anything else you need */ ) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 9c35c07a7e587..3f27d690f949b 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -379,6 +379,17 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> { ); let builders = [ + AttrBuilder<(ins "llvm::ArrayRef": $inst_data), + [{ + auto sg_layout = DenseI32ArrayAttr(); + auto sg_data = DenseI32ArrayAttr(); + auto order = DenseI32ArrayAttr(); + auto lane_layout = DenseI32ArrayAttr(); + auto lane_data = DenseI32ArrayAttr(); + return $_get($_ctxt, sg_layout, sg_data, + DenseI32ArrayAttr::get($_ctxt, inst_data), + lane_layout, lane_data, order); + }]>, AttrBuilder<(ins "llvm::ArrayRef": $inst_data, "llvm::ArrayRef": $lane_layout, "llvm::ArrayRef": $lane_data), diff --git a/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td b/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td index b985d5450be0e..34f333e556deb 100644 --- a/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.td @@ -16,6 +16,24 @@ include "mlir/Dialect/Transform/IR/TransformTypes.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/OpBase.td" +def GetDescOp : Op, + NavigationTransformOpTrait, MemoryEffectsOpInterface +]> { + + let summary = "Get a handle to the descriptor op of a value."; + let description = [{ + Traces the producers of the given value until an `xegpu.create_nd_tdesc` + descriptor op is found. Returns a handle to it. Currently traces + producers by following only the first operand of producer ops. + }]; + + let arguments = (ins TransformValueHandleTypeInterface:$target); + + let results = (outs TransformHandleTypeInterface:$descHandle); + let assemblyFormat = "$target attr-dict `:` functional-type(operands, results)"; +} + def SetDescLayoutOp : Op, @@ -31,16 +49,16 @@ def SetDescLayoutOp : Op : $sg_layout, - Variadic : $sg_data, - Variadic : $inst_data, + TransformHandleTypeInterface:$target, + Variadic:$sg_layout, + Variadic:$sg_data, + Variadic:$inst_data, DefaultValuedOptionalAttr:$static_sg_layout, DefaultValuedOptionalAttr:$static_sg_data, DefaultValuedOptionalAttr:$static_inst_data ); - let results = (outs TransformHandleTypeInterface : $transformed); + let results = (outs TransformHandleTypeInterface:$transformed); let builders = [ OpBuilder<(ins "Value":$target, "ArrayRef":$mixedSgLayout, @@ -78,4 +96,69 @@ def SetDescLayoutOp : Op, + TransformOpInterface +]> { + + let summary = "Set xegpu.layout attribute of an op."; + let description = [{ + Sets the `xegpu.layout` attribute of an op. If `result=true`, sets the + `layout_result_{index}`, otherwise `layout_operand_{index}` attribute. The + target operand/result value is defined by the `index` argument. The layout + is defined by the `sg_layout`, `sg_data` and optional `inst_data` attributes. + }]; + + let arguments = (ins TransformHandleTypeInterface:$target, + DefaultValuedOptionalAttr:$index, + Variadic:$sg_layout, + Variadic:$sg_data, + Variadic:$inst_data, + DefaultValuedOptionalAttr:$static_sg_layout, + DefaultValuedOptionalAttr:$static_sg_data, + DefaultValuedOptionalAttr:$static_inst_data, + DefaultValuedAttr:$result + ); + + let results = (outs); + let builders = [ + OpBuilder<(ins "Value":$target, + "int64_t":$index, + "ArrayRef":$mixedSgLayout, + "ArrayRef":$mixedSgData, + "ArrayRef":$mixedInstData, + CArg<"bool", "false">:$result + )>, + ]; + + let assemblyFormat = [{ + $target (`result` $result^)? (`index` `=` $index^)? + `sg_layout` `=` custom($sg_layout, $static_sg_layout) + `sg_data` `=` custom($sg_data, $static_sg_data) + (`inst_data` `=` custom($inst_data, $static_inst_data)^)? + attr-dict `:` qualified(type(operands)) + }]; + + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure apply( + ::mlir::transform::TransformRewriter &rewriter, + ::mlir::transform::TransformResults &transformResults, + ::mlir::transform::TransformState &state); + + ::llvm::SmallVector<::mlir::OpFoldResult> getMixedSgLayout() { + Builder b(getContext()); + return getMixedValues(getStaticSgLayout(), getSgLayout(), b); + } + ::llvm::SmallVector<::mlir::OpFoldResult> getMixedSgData() { + Builder b(getContext()); + return getMixedValues(getStaticSgData(), getSgData(), b); + } + ::llvm::SmallVector<::mlir::OpFoldResult> getMixedInstData() { + Builder b(getContext()); + return getMixedValues(getStaticInstData(), getInstData(), b); + } + }]; +} + #endif // XEGPU_TRANSFORM_OPS diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td index e42799689e490..12270af870b3b 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td @@ -47,7 +47,7 @@ def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> { Option< "layoutKind", "layout-kind", "std::string", /*default=*/"\"lane\"", - "Propagate a `sg` / `inst` / `lane` level of xegpu layouts."> + "Propagate `inst` / `lane` level of xegpu layouts."> ]; } diff --git a/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h b/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h index b7394387b0f9a..79dfd7a2795f0 100644 --- a/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h +++ b/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h @@ -359,6 +359,20 @@ class MlirOptMainConfig { /// the loaded IR. using PassPipelineFn = llvm::function_ref; +/// Register basic command line options. +/// - toolName is used for the header displayed by `--help`. +/// - registry should contain all the dialects that can be parsed in the source. +/// - return std::string for help header. +std::string registerCLIOptions(llvm::StringRef toolName, + DialectRegistry ®istry); + +/// Parse command line options. +/// - helpHeader is used for the header displayed by `--help`. +/// - return std::pair for +/// inputFilename and outputFilename command line option values. +std::pair parseCLIOptions(int argc, char **argv, + llvm::StringRef helpHeader); + /// Register and parse command line options. /// - toolName is used for the header displayed by `--help`. /// - registry should contain all the dialects that can be parsed in the source. diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index cda4fe19c16f8..d90f27bd037e6 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -32,33 +32,6 @@ using llvm::SmallVector; using llvm::StringRef; using llvm::Twine; -//------------------------------------------------------------------------------ -// Docstrings (trivial, non-duplicated docstrings are included inline). -//------------------------------------------------------------------------------ - -static const char kContextParseTypeDocstring[] = - R"(Parses the assembly form of a type. - -Returns a Type object or raises an MLIRError if the type cannot be parsed. - -See also: https://mlir.llvm.org/docs/LangRef/#type-system -)"; - -static const char kContextGetCallSiteLocationDocstring[] = - R"(Gets a Location representing a caller and callsite)"; - -static const char kContextGetFileLocationDocstring[] = - R"(Gets a Location representing a file, line and column)"; - -static const char kContextGetFileRangeDocstring[] = - R"(Gets a Location representing a file, line and column range)"; - -static const char kContextGetFusedLocationDocstring[] = - R"(Gets a Location representing a fused location with optional metadata)"; - -static const char kContextGetNameLocationDocString[] = - R"(Gets a Location representing a named location with optional child location)"; - static const char kModuleParseDocstring[] = R"(Parses a module's assembly format from a string. @@ -67,132 +40,12 @@ Returns a new MlirModule or raises an MLIRError if the parsing fails. See also: https://mlir.llvm.org/docs/LangRef/ )"; -static const char kModuleCAPICreate[] = - R"(Creates a Module from a MlirModule wrapped by a capsule (i.e. module._CAPIPtr). -Note this returns a new object BUT _clear_mlir_module(module) must be called to -prevent double-frees (of the underlying mlir::Module). -)"; - -static const char kOperationCreateDocstring[] = - R"(Creates a new operation. - -Args: - name: Operation name (e.g. "dialect.operation"). - results: Sequence of Type representing op result types. - attributes: Dict of str:Attribute. - successors: List of Block for the operation's successors. - regions: Number of regions to create. - location: A Location object (defaults to resolve from context manager). - ip: An InsertionPoint (defaults to resolve from context manager or set to - False to disable insertion, even with an insertion point set in the - context manager). - infer_type: Whether to infer result types. -Returns: - A new "detached" Operation object. Detached operations can be added - to blocks, which causes them to become "attached." -)"; - -static const char kOperationPrintDocstring[] = - R"(Prints the assembly form of the operation to a file like object. - -Args: - file: The file like object to write to. Defaults to sys.stdout. - binary: Whether to write bytes (True) or str (False). Defaults to False. - large_elements_limit: Whether to elide elements attributes above this - number of elements. Defaults to None (no limit). - large_resource_limit: Whether to elide resource attributes above this - number of characters. Defaults to None (no limit). If large_elements_limit - is set and this is None, the behavior will be to use large_elements_limit - as large_resource_limit. - enable_debug_info: Whether to print debug/location information. Defaults - to False. - pretty_debug_info: Whether to format debug information for easier reading - by a human (warning: the result is unparseable). - print_generic_op_form: Whether to print the generic assembly forms of all - ops. Defaults to False. - use_local_Scope: Whether to print in a way that is more optimized for - multi-threaded access but may not be consistent with how the overall - module prints. - assume_verified: By default, if not printing generic form, the verifier - will be run and if it fails, generic form will be printed with a comment - about failed verification. While a reasonable default for interactive use, - for systematic use, it is often better for the caller to verify explicitly - and report failures in a more robust fashion. Set this to True if doing this - in order to avoid running a redundant verification. If the IR is actually - invalid, behavior is undefined. - skip_regions: Whether to skip printing regions. Defaults to False. -)"; - -static const char kOperationPrintStateDocstring[] = - R"(Prints the assembly form of the operation to a file like object. - -Args: - file: The file like object to write to. Defaults to sys.stdout. - binary: Whether to write bytes (True) or str (False). Defaults to False. - state: AsmState capturing the operation numbering and flags. -)"; - -static const char kOperationGetAsmDocstring[] = - R"(Gets the assembly form of the operation with all options available. - -Args: - binary: Whether to return a bytes (True) or str (False) object. Defaults to - False. - ... others ...: See the print() method for common keyword arguments for - configuring the printout. -Returns: - Either a bytes or str object, depending on the setting of the 'binary' - argument. -)"; - -static const char kOperationPrintBytecodeDocstring[] = - R"(Write the bytecode form of the operation to a file like object. - -Args: - file: The file like object to write to. - desired_version: The version of bytecode to emit. -Returns: - The bytecode writer status. -)"; - -static const char kOperationStrDunderDocstring[] = - R"(Gets the assembly form of the operation with default options. - -If more advanced control over the assembly formatting or I/O options is needed, -use the dedicated print or get_asm method, which supports keyword arguments to -customize behavior. -)"; - static const char kDumpDocstring[] = - R"(Dumps a debug representation of the object to stderr.)"; - -static const char kAppendBlockDocstring[] = - R"(Appends a new block, with argument types as positional args. - -Returns: - The created block. -)"; - -static const char kValueDunderStrDocstring[] = - R"(Returns the string form of the value. - -If the value is a block argument, this is the assembly form of its type and the -position in the argument list. If the value is an operation result, this is -equivalent to printing the operation that produced it. -)"; - -static const char kGetNameAsOperand[] = - R"(Returns the string form of value as an operand (i.e., the ValueID). -)"; - -static const char kValueReplaceAllUsesWithDocstring[] = - R"(Replace all uses of value with the new value, updating anything in -the IR that uses 'self' to use the other value instead. -)"; + "Dumps a debug representation of the object to stderr."; static const char kValueReplaceAllUsesExceptDocstring[] = - R"("Replace all uses of this value with the 'with' value, except for those -in 'exceptions'. 'exceptions' can be either a single operation or a list of + R"(Replace all uses of this value with the `with` value, except for those +in `exceptions`. `exceptions` can be either a single operation or a list of operations. )"; @@ -274,22 +127,26 @@ struct PyGlobalDebugFlag { // Debug flags. nb::class_(m, "_GlobalDebug") .def_prop_rw_static("flag", &PyGlobalDebugFlag::get, - &PyGlobalDebugFlag::set, "LLVM-wide debug flag") + &PyGlobalDebugFlag::set, "LLVM-wide debug flag.") .def_static( "set_types", [](const std::string &type) { nb::ft_lock_guard lock(mutex); mlirSetGlobalDebugType(type.c_str()); }, - "types"_a, "Sets specific debug types to be produced by LLVM") - .def_static("set_types", [](const std::vector &types) { - std::vector pointers; - pointers.reserve(types.size()); - for (const std::string &str : types) - pointers.push_back(str.c_str()); - nb::ft_lock_guard lock(mutex); - mlirSetGlobalDebugTypes(pointers.data(), pointers.size()); - }); + "types"_a, "Sets specific debug types to be produced by LLVM.") + .def_static( + "set_types", + [](const std::vector &types) { + std::vector pointers; + pointers.reserve(types.size()); + for (const std::string &str : types) + pointers.push_back(str.c_str()); + nb::ft_lock_guard lock(mutex); + mlirSetGlobalDebugTypes(pointers.data(), pointers.size()); + }, + "types"_a, + "Sets multiple specific debug types to be produced by LLVM."); } private: @@ -316,12 +173,18 @@ struct PyAttrBuilderMap { static void bind(nb::module_ &m) { nb::class_(m, "AttrBuilder") - .def_static("contains", &PyAttrBuilderMap::dunderContains) - .def_static("get", &PyAttrBuilderMap::dunderGetItemNamed) + .def_static("contains", &PyAttrBuilderMap::dunderContains, + "attribute_kind"_a, + "Checks whether an attribute builder is registered for the " + "given attribute kind.") + .def_static("get", &PyAttrBuilderMap::dunderGetItemNamed, + "attribute_kind"_a, + "Gets the registered attribute builder for the given " + "attribute kind.") .def_static("insert", &PyAttrBuilderMap::dunderSetItemNamed, "attribute_kind"_a, "attr_builder"_a, "replace"_a = false, "Register an attribute builder for building MLIR " - "attributes from python values."); + "attributes from Python values."); } }; @@ -357,8 +220,10 @@ class PyRegionIterator { static void bind(nb::module_ &m) { nb::class_(m, "RegionIterator") - .def("__iter__", &PyRegionIterator::dunderIter) - .def("__next__", &PyRegionIterator::dunderNext); + .def("__iter__", &PyRegionIterator::dunderIter, + "Returns an iterator over the regions in the operation.") + .def("__next__", &PyRegionIterator::dunderNext, + "Returns the next region in the iteration."); } private: @@ -386,7 +251,8 @@ class PyRegionList : public Sliceable { } static void bindDerived(ClassTy &c) { - c.def("__iter__", &PyRegionList::dunderIter); + c.def("__iter__", &PyRegionList::dunderIter, + "Returns an iterator over the regions in the sequence."); } private: @@ -430,8 +296,10 @@ class PyBlockIterator { static void bind(nb::module_ &m) { nb::class_(m, "BlockIterator") - .def("__iter__", &PyBlockIterator::dunderIter) - .def("__next__", &PyBlockIterator::dunderNext); + .def("__iter__", &PyBlockIterator::dunderIter, + "Returns an iterator over the blocks in the operation's region.") + .def("__next__", &PyBlockIterator::dunderNext, + "Returns the next block in the iteration."); } private: @@ -493,10 +361,19 @@ class PyBlockList { static void bind(nb::module_ &m) { nb::class_(m, "BlockList") - .def("__getitem__", &PyBlockList::dunderGetItem) - .def("__iter__", &PyBlockList::dunderIter) - .def("__len__", &PyBlockList::dunderLen) - .def("append", &PyBlockList::appendBlock, kAppendBlockDocstring, + .def("__getitem__", &PyBlockList::dunderGetItem, + "Returns the block at the specified index.") + .def("__iter__", &PyBlockList::dunderIter, + "Returns an iterator over blocks in the operation's region.") + .def("__len__", &PyBlockList::dunderLen, + "Returns the number of blocks in the operation's region.") + .def("append", &PyBlockList::appendBlock, + R"( + Appends a new block, with argument types as positional args. + + Returns: + The created block. + )", nb::arg("args"), nb::kw_only(), nb::arg("arg_locs") = std::nullopt); } @@ -527,8 +404,10 @@ class PyOperationIterator { static void bind(nb::module_ &m) { nb::class_(m, "OperationIterator") - .def("__iter__", &PyOperationIterator::dunderIter) - .def("__next__", &PyOperationIterator::dunderNext); + .def("__iter__", &PyOperationIterator::dunderIter, + "Returns an iterator over the operations in an operation's block.") + .def("__next__", &PyOperationIterator::dunderNext, + "Returns the next operation in the iteration."); } private: @@ -584,9 +463,12 @@ class PyOperationList { static void bind(nb::module_ &m) { nb::class_(m, "OperationList") - .def("__getitem__", &PyOperationList::dunderGetItem) - .def("__iter__", &PyOperationList::dunderIter) - .def("__len__", &PyOperationList::dunderLen); + .def("__getitem__", &PyOperationList::dunderGetItem, + "Returns the operation at the specified index.") + .def("__iter__", &PyOperationList::dunderIter, + "Returns an iterator over operations in the list.") + .def("__len__", &PyOperationList::dunderLen, + "Returns the number of operations in the list."); } private: @@ -609,8 +491,10 @@ class PyOpOperand { static void bind(nb::module_ &m) { nb::class_(m, "OpOperand") - .def_prop_ro("owner", &PyOpOperand::getOwner) - .def_prop_ro("operand_number", &PyOpOperand::getOperandNumber); + .def_prop_ro("owner", &PyOpOperand::getOwner, + "Returns the operation that owns this operand.") + .def_prop_ro("operand_number", &PyOpOperand::getOperandNumber, + "Returns the operand number in the owning operation."); } private: @@ -634,8 +518,10 @@ class PyOpOperandIterator { static void bind(nb::module_ &m) { nb::class_(m, "OpOperandIterator") - .def("__iter__", &PyOpOperandIterator::dunderIter) - .def("__next__", &PyOpOperandIterator::dunderNext); + .def("__iter__", &PyOpOperandIterator::dunderIter, + "Returns an iterator over operands.") + .def("__next__", &PyOpOperandIterator::dunderNext, + "Returns the next operand in the iteration."); } private: @@ -1626,16 +1512,21 @@ class PyOpResult : public PyConcreteValue { static void bindDerived(ClassTy &c) { c.def_prop_ro( - "owner", [](PyOpResult &self) -> nb::typed { + "owner", + [](PyOpResult &self) -> nb::typed { assert(mlirOperationEqual(self.getParentOperation()->get(), mlirOpResultGetOwner(self.get())) && "expected the owner of the value in Python to match that in " "the IR"); return self.getParentOperation().getObject(); - }); - c.def_prop_ro("result_number", [](PyOpResult &self) { - return mlirOpResultGetResultNumber(self.get()); - }); + }, + "Returns the operation that produces this result."); + c.def_prop_ro( + "result_number", + [](PyOpResult &self) { + return mlirOpResultGetResultNumber(self.get()); + }, + "Returns the position of this result in the operation's result list."); } }; @@ -1671,13 +1562,18 @@ class PyOpResultList : public Sliceable { operation(std::move(operation)) {} static void bindDerived(ClassTy &c) { - c.def_prop_ro("types", [](PyOpResultList &self) { - return getValueTypes(self, self.operation->getContext()); - }); - c.def_prop_ro("owner", - [](PyOpResultList &self) -> nb::typed { - return self.operation->createOpView(); - }); + c.def_prop_ro( + "types", + [](PyOpResultList &self) { + return getValueTypes(self, self.operation->getContext()); + }, + "Returns a list of types for all results in this result list."); + c.def_prop_ro( + "owner", + [](PyOpResultList &self) -> nb::typed { + return self.operation->createOpView(); + }, + "Returns the operation that owns this result list."); } PyOperationRef &getOperation() { return operation; } @@ -2427,19 +2323,25 @@ class PyBlockArgument : public PyConcreteValue { using PyConcreteValue::PyConcreteValue; static void bindDerived(ClassTy &c) { - c.def_prop_ro("owner", [](PyBlockArgument &self) { - return PyBlock(self.getParentOperation(), - mlirBlockArgumentGetOwner(self.get())); - }); - c.def_prop_ro("arg_number", [](PyBlockArgument &self) { - return mlirBlockArgumentGetArgNumber(self.get()); - }); + c.def_prop_ro( + "owner", + [](PyBlockArgument &self) { + return PyBlock(self.getParentOperation(), + mlirBlockArgumentGetOwner(self.get())); + }, + "Returns the block that owns this argument."); + c.def_prop_ro( + "arg_number", + [](PyBlockArgument &self) { + return mlirBlockArgumentGetArgNumber(self.get()); + }, + "Returns the position of this argument in the block's argument list."); c.def( "set_type", [](PyBlockArgument &self, PyType type) { return mlirBlockArgumentSetType(self.get(), type); }, - nb::arg("type")); + nb::arg("type"), "Sets the type of this block argument."); } }; @@ -2462,9 +2364,12 @@ class PyBlockArgumentList operation(std::move(operation)), block(block) {} static void bindDerived(ClassTy &c) { - c.def_prop_ro("types", [](PyBlockArgumentList &self) { - return getValueTypes(self, self.operation->getContext()); - }); + c.def_prop_ro( + "types", + [](PyBlockArgumentList &self) { + return getValueTypes(self, self.operation->getContext()); + }, + "Returns a list of types for all arguments in this argument list."); } private: @@ -2516,7 +2421,9 @@ class PyOpOperandList : public Sliceable { } static void bindDerived(ClassTy &c) { - c.def("__setitem__", &PyOpOperandList::dunderSetItem); + c.def("__setitem__", &PyOpOperandList::dunderSetItem, nb::arg("index"), + nb::arg("value"), + "Sets the operand at the specified index to a new value."); } private: @@ -2571,7 +2478,8 @@ class PyOpSuccessors : public Sliceable { } static void bindDerived(ClassTy &c) { - c.def("__setitem__", &PyOpSuccessors::dunderSetItem); + c.def("__setitem__", &PyOpSuccessors::dunderSetItem, nb::arg("index"), + nb::arg("block"), "Sets the successor block at the specified index."); } private: @@ -2743,55 +2651,70 @@ class PyOpAttributeMap { static void bind(nb::module_ &m) { nb::class_(m, "OpAttributeMap") - .def("__contains__", &PyOpAttributeMap::dunderContains) - .def("__len__", &PyOpAttributeMap::dunderLen) - .def("__getitem__", &PyOpAttributeMap::dunderGetItemNamed) - .def("__getitem__", &PyOpAttributeMap::dunderGetItemIndexed) - .def("__setitem__", &PyOpAttributeMap::dunderSetItem) - .def("__delitem__", &PyOpAttributeMap::dunderDelItem) - .def("__iter__", - [](PyOpAttributeMap &self) { - nb::list keys; - PyOpAttributeMap::forEachAttr( - self.operation->get(), - [&](MlirStringRef name, MlirAttribute) { - keys.append(nb::str(name.data, name.length)); - }); - return nb::iter(keys); - }) - .def("keys", - [](PyOpAttributeMap &self) { - nb::list out; - PyOpAttributeMap::forEachAttr( - self.operation->get(), - [&](MlirStringRef name, MlirAttribute) { - out.append(nb::str(name.data, name.length)); - }); - return out; - }) - .def("values", - [](PyOpAttributeMap &self) { - nb::list out; - PyOpAttributeMap::forEachAttr( - self.operation->get(), - [&](MlirStringRef, MlirAttribute attr) { - out.append(PyAttribute(self.operation->getContext(), attr) - .maybeDownCast()); - }); - return out; - }) - .def("items", [](PyOpAttributeMap &self) { - nb::list out; - PyOpAttributeMap::forEachAttr( - self.operation->get(), - [&](MlirStringRef name, MlirAttribute attr) { - out.append(nb::make_tuple( - nb::str(name.data, name.length), - PyAttribute(self.operation->getContext(), attr) - .maybeDownCast())); - }); - return out; - }); + .def("__contains__", &PyOpAttributeMap::dunderContains, nb::arg("name"), + "Checks if an attribute with the given name exists in the map.") + .def("__len__", &PyOpAttributeMap::dunderLen, + "Returns the number of attributes in the map.") + .def("__getitem__", &PyOpAttributeMap::dunderGetItemNamed, + nb::arg("name"), "Gets an attribute by name.") + .def("__getitem__", &PyOpAttributeMap::dunderGetItemIndexed, + nb::arg("index"), "Gets a named attribute by index.") + .def("__setitem__", &PyOpAttributeMap::dunderSetItem, nb::arg("name"), + nb::arg("attr"), "Sets an attribute with the given name.") + .def("__delitem__", &PyOpAttributeMap::dunderDelItem, nb::arg("name"), + "Deletes an attribute with the given name.") + .def( + "__iter__", + [](PyOpAttributeMap &self) { + nb::list keys; + PyOpAttributeMap::forEachAttr( + self.operation->get(), + [&](MlirStringRef name, MlirAttribute) { + keys.append(nb::str(name.data, name.length)); + }); + return nb::iter(keys); + }, + "Iterates over attribute names.") + .def( + "keys", + [](PyOpAttributeMap &self) { + nb::list out; + PyOpAttributeMap::forEachAttr( + self.operation->get(), + [&](MlirStringRef name, MlirAttribute) { + out.append(nb::str(name.data, name.length)); + }); + return out; + }, + "Returns a list of attribute names.") + .def( + "values", + [](PyOpAttributeMap &self) { + nb::list out; + PyOpAttributeMap::forEachAttr( + self.operation->get(), + [&](MlirStringRef, MlirAttribute attr) { + out.append(PyAttribute(self.operation->getContext(), attr) + .maybeDownCast()); + }); + return out; + }, + "Returns a list of attribute values.") + .def( + "items", + [](PyOpAttributeMap &self) { + nb::list out; + PyOpAttributeMap::forEachAttr( + self.operation->get(), + [&](MlirStringRef name, MlirAttribute attr) { + out.append(nb::make_tuple( + nb::str(name.data, name.length), + PyAttribute(self.operation->getContext(), attr) + .maybeDownCast())); + }); + return out; + }, + "Returns a list of `(name, attribute)` tuples."); } private: @@ -2979,62 +2902,103 @@ void mlir::python::populateIRCore(nb::module_ &m) { // Mapping of Diagnostics. //---------------------------------------------------------------------------- nb::class_(m, "Diagnostic") - .def_prop_ro("severity", &PyDiagnostic::getSeverity) - .def_prop_ro("location", &PyDiagnostic::getLocation) - .def_prop_ro("message", &PyDiagnostic::getMessage) - .def_prop_ro("notes", &PyDiagnostic::getNotes) - .def("__str__", [](PyDiagnostic &self) -> nb::str { - if (!self.isValid()) - return nb::str(""); - return self.getMessage(); - }); + .def_prop_ro("severity", &PyDiagnostic::getSeverity, + "Returns the severity of the diagnostic.") + .def_prop_ro("location", &PyDiagnostic::getLocation, + "Returns the location associated with the diagnostic.") + .def_prop_ro("message", &PyDiagnostic::getMessage, + "Returns the message text of the diagnostic.") + .def_prop_ro("notes", &PyDiagnostic::getNotes, + "Returns a tuple of attached note diagnostics.") + .def( + "__str__", + [](PyDiagnostic &self) -> nb::str { + if (!self.isValid()) + return nb::str(""); + return self.getMessage(); + }, + "Returns the diagnostic message as a string."); nb::class_(m, "DiagnosticInfo") - .def("__init__", - [](PyDiagnostic::DiagnosticInfo &self, PyDiagnostic diag) { - new (&self) PyDiagnostic::DiagnosticInfo(diag.getInfo()); - }) - .def_ro("severity", &PyDiagnostic::DiagnosticInfo::severity) - .def_ro("location", &PyDiagnostic::DiagnosticInfo::location) - .def_ro("message", &PyDiagnostic::DiagnosticInfo::message) - .def_ro("notes", &PyDiagnostic::DiagnosticInfo::notes) - .def("__str__", - [](PyDiagnostic::DiagnosticInfo &self) { return self.message; }); + .def( + "__init__", + [](PyDiagnostic::DiagnosticInfo &self, PyDiagnostic diag) { + new (&self) PyDiagnostic::DiagnosticInfo(diag.getInfo()); + }, + "diag"_a, "Creates a DiagnosticInfo from a Diagnostic.") + .def_ro("severity", &PyDiagnostic::DiagnosticInfo::severity, + "The severity level of the diagnostic.") + .def_ro("location", &PyDiagnostic::DiagnosticInfo::location, + "The location associated with the diagnostic.") + .def_ro("message", &PyDiagnostic::DiagnosticInfo::message, + "The message text of the diagnostic.") + .def_ro("notes", &PyDiagnostic::DiagnosticInfo::notes, + "List of attached note diagnostics.") + .def( + "__str__", + [](PyDiagnostic::DiagnosticInfo &self) { return self.message; }, + "Returns the diagnostic message as a string."); nb::class_(m, "DiagnosticHandler") - .def("detach", &PyDiagnosticHandler::detach) - .def_prop_ro("attached", &PyDiagnosticHandler::isAttached) - .def_prop_ro("had_error", &PyDiagnosticHandler::getHadError) - .def("__enter__", &PyDiagnosticHandler::contextEnter) + .def("detach", &PyDiagnosticHandler::detach, + "Detaches the diagnostic handler from the context.") + .def_prop_ro("attached", &PyDiagnosticHandler::isAttached, + "Returns True if the handler is attached to a context.") + .def_prop_ro("had_error", &PyDiagnosticHandler::getHadError, + "Returns True if an error was encountered during diagnostic " + "handling.") + .def("__enter__", &PyDiagnosticHandler::contextEnter, + "Enters the diagnostic handler as a context manager.") .def("__exit__", &PyDiagnosticHandler::contextExit, nb::arg("exc_type").none(), nb::arg("exc_value").none(), - nb::arg("traceback").none()); + nb::arg("traceback").none(), + "Exits the diagnostic handler context manager."); // Expose DefaultThreadPool to python nb::class_(m, "ThreadPool") - .def("__init__", [](PyThreadPool &self) { new (&self) PyThreadPool(); }) - .def("get_max_concurrency", &PyThreadPool::getMaxConcurrency) - .def("_mlir_thread_pool_ptr", &PyThreadPool::_mlir_thread_pool_ptr); + .def( + "__init__", [](PyThreadPool &self) { new (&self) PyThreadPool(); }, + "Creates a new thread pool with default concurrency.") + .def("get_max_concurrency", &PyThreadPool::getMaxConcurrency, + "Returns the maximum number of threads in the pool.") + .def("_mlir_thread_pool_ptr", &PyThreadPool::_mlir_thread_pool_ptr, + "Returns the raw pointer to the LLVM thread pool as a string."); nb::class_(m, "Context") - .def("__init__", - [](PyMlirContext &self) { - MlirContext context = mlirContextCreateWithThreading(false); - new (&self) PyMlirContext(context); - }) - .def_static("_get_live_count", &PyMlirContext::getLiveCount) - .def("_get_context_again", - [](PyMlirContext &self) -> nb::typed { - PyMlirContextRef ref = PyMlirContext::forContext(self.get()); - return ref.releaseObject(); - }) - .def("_get_live_module_count", &PyMlirContext::getLiveModuleCount) - .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyMlirContext::getCapsule) + .def( + "__init__", + [](PyMlirContext &self) { + MlirContext context = mlirContextCreateWithThreading(false); + new (&self) PyMlirContext(context); + }, + R"( + Creates a new MLIR context. + + The context is the top-level container for all MLIR objects. It owns the storage + for types, attributes, locations, and other core IR objects. A context can be + configured to allow or disallow unregistered dialects and can have dialects + loaded on-demand.)") + .def_static("_get_live_count", &PyMlirContext::getLiveCount, + "Gets the number of live Context objects.") + .def( + "_get_context_again", + [](PyMlirContext &self) -> nb::typed { + PyMlirContextRef ref = PyMlirContext::forContext(self.get()); + return ref.releaseObject(); + }, + "Gets another reference to the same context.") + .def("_get_live_module_count", &PyMlirContext::getLiveModuleCount, + "Gets the number of live modules owned by this context.") + .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyMlirContext::getCapsule, + "Gets a capsule wrapping the MlirContext.") .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, - &PyMlirContext::createFromCapsule) - .def("__enter__", &PyMlirContext::contextEnter) + &PyMlirContext::createFromCapsule, + "Creates a Context from a capsule wrapping MlirContext.") + .def("__enter__", &PyMlirContext::contextEnter, + "Enters the context as a context manager.") .def("__exit__", &PyMlirContext::contextExit, nb::arg("exc_type").none(), - nb::arg("exc_value").none(), nb::arg("traceback").none()) + nb::arg("exc_value").none(), nb::arg("traceback").none(), + "Exits the context manager.") .def_prop_ro_static( "current", [](nb::object & /*class*/) @@ -3045,14 +3009,15 @@ void mlir::python::populateIRCore(nb::module_ &m) { return nb::cast(context); }, nb::sig("def current(/) -> Context | None"), - "Gets the Context bound to the current thread or raises ValueError") + "Gets the Context bound to the current thread or returns None if no " + "context is set.") .def_prop_ro( "dialects", [](PyMlirContext &self) { return PyDialects(self.getRef()); }, - "Gets a container for accessing dialects by name") + "Gets a container for accessing dialects by name.") .def_prop_ro( "d", [](PyMlirContext &self) { return PyDialects(self.getRef()); }, - "Alias for 'dialect'") + "Alias for `dialects`.") .def( "get_dialect_descriptor", [=](PyMlirContext &self, std::string &name) { @@ -3065,7 +3030,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { return PyDialectDescriptor(self.getRef(), dialect); }, nb::arg("dialect_name"), - "Gets or loads a dialect by name, returning its descriptor object") + "Gets or loads a dialect by name, returning its descriptor object.") .def_prop_rw( "allow_unregistered_dialects", [](PyMlirContext &self) -> bool { @@ -3073,67 +3038,110 @@ void mlir::python::populateIRCore(nb::module_ &m) { }, [](PyMlirContext &self, bool value) { mlirContextSetAllowUnregisteredDialects(self.get(), value); - }) + }, + "Controls whether unregistered dialects are allowed in this context.") .def("attach_diagnostic_handler", &PyMlirContext::attachDiagnosticHandler, nb::arg("callback"), - "Attaches a diagnostic handler that will receive callbacks") + "Attaches a diagnostic handler that will receive callbacks.") .def( "enable_multithreading", [](PyMlirContext &self, bool enable) { mlirContextEnableMultithreading(self.get(), enable); }, - nb::arg("enable")) - .def("set_thread_pool", - [](PyMlirContext &self, PyThreadPool &pool) { - // we should disable multi-threading first before setting - // new thread pool otherwise the assert in - // MLIRContext::setThreadPool will be raised. - mlirContextEnableMultithreading(self.get(), false); - mlirContextSetThreadPool(self.get(), pool.get()); - }) - .def("get_num_threads", - [](PyMlirContext &self) { - return mlirContextGetNumThreads(self.get()); - }) - .def("_mlir_thread_pool_ptr", - [](PyMlirContext &self) { - MlirLlvmThreadPool pool = mlirContextGetThreadPool(self.get()); - std::stringstream ss; - ss << pool.ptr; - return ss.str(); - }) + nb::arg("enable"), + R"( + Enables or disables multi-threading support in the context. + + Args: + enable: Whether to enable (True) or disable (False) multi-threading. + )") + .def( + "set_thread_pool", + [](PyMlirContext &self, PyThreadPool &pool) { + // we should disable multi-threading first before setting + // new thread pool otherwise the assert in + // MLIRContext::setThreadPool will be raised. + mlirContextEnableMultithreading(self.get(), false); + mlirContextSetThreadPool(self.get(), pool.get()); + }, + R"( + Sets a custom thread pool for the context to use. + + Args: + pool: A ThreadPool object to use for parallel operations. + + Note: + Multi-threading is automatically disabled before setting the thread pool.)") + .def( + "get_num_threads", + [](PyMlirContext &self) { + return mlirContextGetNumThreads(self.get()); + }, + "Gets the number of threads in the context's thread pool.") + .def( + "_mlir_thread_pool_ptr", + [](PyMlirContext &self) { + MlirLlvmThreadPool pool = mlirContextGetThreadPool(self.get()); + std::stringstream ss; + ss << pool.ptr; + return ss.str(); + }, + "Gets the raw pointer to the LLVM thread pool as a string.") .def( "is_registered_operation", [](PyMlirContext &self, std::string &name) { return mlirContextIsRegisteredOperation( self.get(), MlirStringRef{name.data(), name.size()}); }, - nb::arg("operation_name")) + nb::arg("operation_name"), + R"( + Checks whether an operation with the given name is registered. + + Args: + operation_name: The fully qualified name of the operation (e.g., `arith.addf`). + + Returns: + True if the operation is registered, False otherwise.)") .def( "append_dialect_registry", [](PyMlirContext &self, PyDialectRegistry ®istry) { mlirContextAppendDialectRegistry(self.get(), registry); }, - nb::arg("registry")) + nb::arg("registry"), + R"( + Appends the contents of a dialect registry to the context. + + Args: + registry: A DialectRegistry containing dialects to append.)") .def_prop_rw("emit_error_diagnostics", &PyMlirContext::getEmitErrorDiagnostics, &PyMlirContext::setEmitErrorDiagnostics, - "Emit error diagnostics to diagnostic handlers. By default " - "error diagnostics are captured and reported through " - "MLIRError exceptions.") - .def("load_all_available_dialects", [](PyMlirContext &self) { - mlirContextLoadAllAvailableDialects(self.get()); - }); + R"( + Controls whether error diagnostics are emitted to diagnostic handlers. + + By default, error diagnostics are captured and reported through MLIRError exceptions.)") + .def( + "load_all_available_dialects", + [](PyMlirContext &self) { + mlirContextLoadAllAvailableDialects(self.get()); + }, + R"( + Loads all dialects available in the registry into the context. + + This eagerly loads all dialects that have been registered, making them + immediately available for use.)"); //---------------------------------------------------------------------------- // Mapping of PyDialectDescriptor //---------------------------------------------------------------------------- nb::class_(m, "DialectDescriptor") - .def_prop_ro("namespace", - [](PyDialectDescriptor &self) { - MlirStringRef ns = mlirDialectGetNamespace(self.get()); - return nb::str(ns.data, ns.length); - }) + .def_prop_ro( + "namespace", + [](PyDialectDescriptor &self) { + MlirStringRef ns = mlirDialectGetNamespace(self.get()); + return nb::str(ns.data, ns.length); + }, + "Returns the namespace of the dialect.") .def( "__repr__", [](PyDialectDescriptor &self) { @@ -3143,35 +3151,43 @@ void mlir::python::populateIRCore(nb::module_ &m) { repr.append(">"); return repr; }, - nb::sig("def __repr__(self) -> str")); + nb::sig("def __repr__(self) -> str"), + "Returns a string representation of the dialect descriptor."); //---------------------------------------------------------------------------- // Mapping of PyDialects //---------------------------------------------------------------------------- nb::class_(m, "Dialects") - .def("__getitem__", - [=](PyDialects &self, std::string keyName) { - MlirDialect dialect = - self.getDialectForKey(keyName, /*attrError=*/false); - nb::object descriptor = - nb::cast(PyDialectDescriptor{self.getContext(), dialect}); - return createCustomDialectWrapper(keyName, std::move(descriptor)); - }) - .def("__getattr__", [=](PyDialects &self, std::string attrName) { - MlirDialect dialect = - self.getDialectForKey(attrName, /*attrError=*/true); - nb::object descriptor = - nb::cast(PyDialectDescriptor{self.getContext(), dialect}); - return createCustomDialectWrapper(attrName, std::move(descriptor)); - }); + .def( + "__getitem__", + [=](PyDialects &self, std::string keyName) { + MlirDialect dialect = + self.getDialectForKey(keyName, /*attrError=*/false); + nb::object descriptor = + nb::cast(PyDialectDescriptor{self.getContext(), dialect}); + return createCustomDialectWrapper(keyName, std::move(descriptor)); + }, + "Gets a dialect by name using subscript notation.") + .def( + "__getattr__", + [=](PyDialects &self, std::string attrName) { + MlirDialect dialect = + self.getDialectForKey(attrName, /*attrError=*/true); + nb::object descriptor = + nb::cast(PyDialectDescriptor{self.getContext(), dialect}); + return createCustomDialectWrapper(attrName, std::move(descriptor)); + }, + "Gets a dialect by name using attribute notation."); //---------------------------------------------------------------------------- // Mapping of PyDialect //---------------------------------------------------------------------------- nb::class_(m, "Dialect") - .def(nb::init(), nb::arg("descriptor")) - .def_prop_ro("descriptor", - [](PyDialect &self) { return self.getDescriptor(); }) + .def(nb::init(), nb::arg("descriptor"), + "Creates a Dialect from a DialectDescriptor.") + .def_prop_ro( + "descriptor", [](PyDialect &self) { return self.getDescriptor(); }, + "Returns the DialectDescriptor for this dialect.") .def( "__repr__", [](const nb::object &self) { @@ -3181,31 +3197,43 @@ void mlir::python::populateIRCore(nb::module_ &m) { nb::str(" (class ") + clazz.attr("__module__") + nb::str(".") + clazz.attr("__name__") + nb::str(")>"); }, - nb::sig("def __repr__(self) -> str")); + nb::sig("def __repr__(self) -> str"), + "Returns a string representation of the dialect."); //---------------------------------------------------------------------------- // Mapping of PyDialectRegistry //---------------------------------------------------------------------------- nb::class_(m, "DialectRegistry") - .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyDialectRegistry::getCapsule) + .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyDialectRegistry::getCapsule, + "Gets a capsule wrapping the MlirDialectRegistry.") .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, - &PyDialectRegistry::createFromCapsule) - .def(nb::init<>()); + &PyDialectRegistry::createFromCapsule, + "Creates a DialectRegistry from a capsule wrapping " + "`MlirDialectRegistry`.") + .def(nb::init<>(), "Creates a new empty dialect registry."); //---------------------------------------------------------------------------- // Mapping of Location //---------------------------------------------------------------------------- nb::class_(m, "Location") - .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyLocation::getCapsule) - .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyLocation::createFromCapsule) - .def("__enter__", &PyLocation::contextEnter) + .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyLocation::getCapsule, + "Gets a capsule wrapping the MlirLocation.") + .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyLocation::createFromCapsule, + "Creates a Location from a capsule wrapping MlirLocation.") + .def("__enter__", &PyLocation::contextEnter, + "Enters the location as a context manager.") .def("__exit__", &PyLocation::contextExit, nb::arg("exc_type").none(), - nb::arg("exc_value").none(), nb::arg("traceback").none()) - .def("__eq__", - [](PyLocation &self, PyLocation &other) -> bool { - return mlirLocationEqual(self, other); - }) - .def("__eq__", [](PyLocation &self, nb::object other) { return false; }) + nb::arg("exc_value").none(), nb::arg("traceback").none(), + "Exits the location context manager.") + .def( + "__eq__", + [](PyLocation &self, PyLocation &other) -> bool { + return mlirLocationEqual(self, other); + }, + "Compares two locations for equality.") + .def( + "__eq__", [](PyLocation &self, nb::object other) { return false; }, + "Compares location with non-location object (always returns False).") .def_prop_ro_static( "current", [](nb::object & /*class*/) -> std::optional { @@ -3217,7 +3245,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { // clang-format off nb::sig("def current(/) -> Location | None"), // clang-format on - "Gets the Location bound to the current thread or raises ValueError") + "Gets the Location bound to the current thread or raises ValueError.") .def_static( "unknown", [](DefaultingPyMlirContext context) { @@ -3225,13 +3253,13 @@ void mlir::python::populateIRCore(nb::module_ &m) { mlirLocationUnknownGet(context->get())); }, nb::arg("context") = nb::none(), - "Gets a Location representing an unknown location") + "Gets a Location representing an unknown location.") .def_static( "callsite", [](PyLocation callee, const std::vector &frames, DefaultingPyMlirContext context) { if (frames.empty()) - throw nb::value_error("No caller frames provided"); + throw nb::value_error("No caller frames provided."); MlirLocation caller = frames.back().get(); for (const PyLocation &frame : llvm::reverse(llvm::ArrayRef(frames).drop_back())) @@ -3240,18 +3268,23 @@ void mlir::python::populateIRCore(nb::module_ &m) { mlirLocationCallSiteGet(callee.get(), caller)); }, nb::arg("callee"), nb::arg("frames"), nb::arg("context") = nb::none(), - kContextGetCallSiteLocationDocstring) - .def("is_a_callsite", mlirLocationIsACallSite) - .def_prop_ro("callee", - [](PyLocation &self) { - return PyLocation(self.getContext(), - mlirLocationCallSiteGetCallee(self)); - }) - .def_prop_ro("caller", - [](PyLocation &self) { - return PyLocation(self.getContext(), - mlirLocationCallSiteGetCaller(self)); - }) + "Gets a Location representing a caller and callsite.") + .def("is_a_callsite", mlirLocationIsACallSite, + "Returns True if this location is a CallSiteLoc.") + .def_prop_ro( + "callee", + [](PyLocation &self) { + return PyLocation(self.getContext(), + mlirLocationCallSiteGetCallee(self)); + }, + "Gets the callee location from a CallSiteLoc.") + .def_prop_ro( + "caller", + [](PyLocation &self) { + return PyLocation(self.getContext(), + mlirLocationCallSiteGetCaller(self)); + }, + "Gets the caller location from a CallSiteLoc.") .def_static( "file", [](std::string filename, int line, int col, @@ -3262,7 +3295,8 @@ void mlir::python::populateIRCore(nb::module_ &m) { context->get(), toMlirStringRef(filename), line, col)); }, nb::arg("filename"), nb::arg("line"), nb::arg("col"), - nb::arg("context") = nb::none(), kContextGetFileLocationDocstring) + nb::arg("context") = nb::none(), + "Gets a Location representing a file, line and column.") .def_static( "file", [](std::string filename, int startLine, int startCol, int endLine, @@ -3274,17 +3308,25 @@ void mlir::python::populateIRCore(nb::module_ &m) { }, nb::arg("filename"), nb::arg("start_line"), nb::arg("start_col"), nb::arg("end_line"), nb::arg("end_col"), - nb::arg("context") = nb::none(), kContextGetFileRangeDocstring) - .def("is_a_file", mlirLocationIsAFileLineColRange) - .def_prop_ro("filename", - [](MlirLocation loc) { - return mlirIdentifierStr( - mlirLocationFileLineColRangeGetFilename(loc)); - }) - .def_prop_ro("start_line", mlirLocationFileLineColRangeGetStartLine) - .def_prop_ro("start_col", mlirLocationFileLineColRangeGetStartColumn) - .def_prop_ro("end_line", mlirLocationFileLineColRangeGetEndLine) - .def_prop_ro("end_col", mlirLocationFileLineColRangeGetEndColumn) + nb::arg("context") = nb::none(), + "Gets a Location representing a file, line and column range.") + .def("is_a_file", mlirLocationIsAFileLineColRange, + "Returns True if this location is a FileLineColLoc.") + .def_prop_ro( + "filename", + [](MlirLocation loc) { + return mlirIdentifierStr( + mlirLocationFileLineColRangeGetFilename(loc)); + }, + "Gets the filename from a FileLineColLoc.") + .def_prop_ro("start_line", mlirLocationFileLineColRangeGetStartLine, + "Gets the start line number from a `FileLineColLoc`.") + .def_prop_ro("start_col", mlirLocationFileLineColRangeGetStartColumn, + "Gets the start column number from a `FileLineColLoc`.") + .def_prop_ro("end_line", mlirLocationFileLineColRangeGetEndLine, + "Gets the end line number from a `FileLineColLoc`.") + .def_prop_ro("end_col", mlirLocationFileLineColRangeGetEndColumn, + "Gets the end column number from a `FileLineColLoc`.") .def_static( "fused", [](const std::vector &pyLocations, @@ -3300,8 +3342,11 @@ void mlir::python::populateIRCore(nb::module_ &m) { return PyLocation(context->getRef(), location); }, nb::arg("locations"), nb::arg("metadata") = nb::none(), - nb::arg("context") = nb::none(), kContextGetFusedLocationDocstring) - .def("is_a_fused", mlirLocationIsAFused) + nb::arg("context") = nb::none(), + "Gets a Location representing a fused location with optional " + "metadata.") + .def("is_a_fused", mlirLocationIsAFused, + "Returns True if this location is a `FusedLoc`.") .def_prop_ro( "locations", [](PyLocation &self) { @@ -3314,7 +3359,8 @@ void mlir::python::populateIRCore(nb::module_ &m) { for (unsigned i = 0; i < numLocations; ++i) pyLocations.emplace_back(self.getContext(), locations[i]); return pyLocations; - }) + }, + "Gets the list of locations from a `FusedLoc`.") .def_static( "name", [](std::string name, std::optional childLoc, @@ -3327,17 +3373,24 @@ void mlir::python::populateIRCore(nb::module_ &m) { : mlirLocationUnknownGet(context->get()))); }, nb::arg("name"), nb::arg("childLoc") = nb::none(), - nb::arg("context") = nb::none(), kContextGetNameLocationDocString) - .def("is_a_name", mlirLocationIsAName) - .def_prop_ro("name_str", - [](MlirLocation loc) { - return mlirIdentifierStr(mlirLocationNameGetName(loc)); - }) - .def_prop_ro("child_loc", - [](PyLocation &self) { - return PyLocation(self.getContext(), - mlirLocationNameGetChildLoc(self)); - }) + nb::arg("context") = nb::none(), + "Gets a Location representing a named location with optional child " + "location.") + .def("is_a_name", mlirLocationIsAName, + "Returns True if this location is a `NameLoc`.") + .def_prop_ro( + "name_str", + [](MlirLocation loc) { + return mlirIdentifierStr(mlirLocationNameGetName(loc)); + }, + "Gets the name string from a `NameLoc`.") + .def_prop_ro( + "child_loc", + [](PyLocation &self) { + return PyLocation(self.getContext(), + mlirLocationNameGetChildLoc(self)); + }, + "Gets the child location from a `NameLoc`.") .def_static( "from_attr", [](PyAttribute &attribute, DefaultingPyMlirContext context) { @@ -3345,41 +3398,59 @@ void mlir::python::populateIRCore(nb::module_ &m) { mlirLocationFromAttribute(attribute)); }, nb::arg("attribute"), nb::arg("context") = nb::none(), - "Gets a Location from a LocationAttr") + "Gets a Location from a `LocationAttr`.") .def_prop_ro( "context", [](PyLocation &self) -> nb::typed { return self.getContext().getObject(); }, - "Context that owns the Location") + "Context that owns the `Location`.") .def_prop_ro( "attr", [](PyLocation &self) { return PyAttribute(self.getContext(), mlirLocationGetAttribute(self)); }, - "Get the underlying LocationAttr") + "Get the underlying `LocationAttr`.") .def( "emit_error", [](PyLocation &self, std::string message) { mlirEmitError(self, message.c_str()); }, - nb::arg("message"), "Emits an error at this location") - .def("__repr__", [](PyLocation &self) { - PyPrintAccumulator printAccum; - mlirLocationPrint(self, printAccum.getCallback(), - printAccum.getUserData()); - return printAccum.join(); - }); + nb::arg("message"), + R"( + Emits an error diagnostic at this location. + + Args: + message: The error message to emit.)") + .def( + "__repr__", + [](PyLocation &self) { + PyPrintAccumulator printAccum; + mlirLocationPrint(self, printAccum.getCallback(), + printAccum.getUserData()); + return printAccum.join(); + }, + "Returns the assembly representation of the location."); //---------------------------------------------------------------------------- // Mapping of Module //---------------------------------------------------------------------------- nb::class_(m, "Module", nb::is_weak_referenceable()) - .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyModule::getCapsule) + .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyModule::getCapsule, + "Gets a capsule wrapping the MlirModule.") .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyModule::createFromCapsule, - kModuleCAPICreate) - .def("_clear_mlir_module", &PyModule::clearMlirModule) + R"( + Creates a Module from a `MlirModule` wrapped by a capsule (i.e. `module._CAPIPtr`). + + This returns a new object **BUT** `_clear_mlir_module(module)` must be called to + prevent double-frees (of the underlying `mlir::Module`).)") + .def("_clear_mlir_module", &PyModule::clearMlirModule, + R"( + Clears the internal MLIR module reference. + + This is used internally to prevent double-free when ownership is transferred + via the C API capsule mechanism. Not intended for normal use.)") .def_static( "parse", [](const std::string &moduleAsm, DefaultingPyMlirContext context) @@ -3427,13 +3498,13 @@ void mlir::python::populateIRCore(nb::module_ &m) { MlirModule module = mlirModuleCreateEmpty(pyLoc.get()); return PyModule::forModule(module).releaseObject(); }, - nb::arg("loc") = nb::none(), "Creates an empty module") + nb::arg("loc") = nb::none(), "Creates an empty module.") .def_prop_ro( "context", [](PyModule &self) -> nb::typed { return self.getContext().getObject(); }, - "Context that created the Module") + "Context that created the `Module`.") .def_prop_ro( "operation", [](PyModule &self) -> nb::typed { @@ -3442,7 +3513,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { self.getRef().releaseObject()) .releaseObject(); }, - "Accesses the module as an operation") + "Accesses the module as an operation.") .def_prop_ro( "body", [](PyModule &self) { @@ -3452,7 +3523,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { PyBlock returnBlock(moduleOp, mlirModuleGetBody(self.get())); return returnBlock; }, - "Return the block for this module") + "Return the block for this module.") .def( "dump", [](PyModule &self) { @@ -3465,39 +3536,59 @@ void mlir::python::populateIRCore(nb::module_ &m) { // Defer to the operation's __str__. return self.attr("operation").attr("__str__")(); }, - nb::sig("def __str__(self) -> str"), kOperationStrDunderDocstring) + nb::sig("def __str__(self) -> str"), + R"( + Gets the assembly form of the operation with default options. + + If more advanced control over the assembly formatting or I/O options is needed, + use the dedicated print or get_asm method, which supports keyword arguments to + customize behavior. + )") .def( "__eq__", [](PyModule &self, PyModule &other) { return mlirModuleEqual(self.get(), other.get()); }, - "other"_a) - .def("__hash__", - [](PyModule &self) { return mlirModuleHashValue(self.get()); }); + "other"_a, "Compares two modules for equality.") + .def( + "__hash__", + [](PyModule &self) { return mlirModuleHashValue(self.get()); }, + "Returns the hash value of the module."); //---------------------------------------------------------------------------- // Mapping of Operation. //---------------------------------------------------------------------------- nb::class_(m, "_OperationBase") - .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, - [](PyOperationBase &self) { - return self.getOperation().getCapsule(); - }) - .def("__eq__", - [](PyOperationBase &self, PyOperationBase &other) { - return mlirOperationEqual(self.getOperation().get(), - other.getOperation().get()); - }) - .def("__eq__", - [](PyOperationBase &self, nb::object other) { return false; }) - .def("__hash__", - [](PyOperationBase &self) { - return mlirOperationHashValue(self.getOperation().get()); - }) - .def_prop_ro("attributes", - [](PyOperationBase &self) { - return PyOpAttributeMap(self.getOperation().getRef()); - }) + .def_prop_ro( + MLIR_PYTHON_CAPI_PTR_ATTR, + [](PyOperationBase &self) { + return self.getOperation().getCapsule(); + }, + "Gets a capsule wrapping the `MlirOperation`.") + .def( + "__eq__", + [](PyOperationBase &self, PyOperationBase &other) { + return mlirOperationEqual(self.getOperation().get(), + other.getOperation().get()); + }, + "Compares two operations for equality.") + .def( + "__eq__", + [](PyOperationBase &self, nb::object other) { return false; }, + "Compares operation with non-operation object (always returns " + "False).") + .def( + "__hash__", + [](PyOperationBase &self) { + return mlirOperationHashValue(self.getOperation().get()); + }, + "Returns the hash value of the operation.") + .def_prop_ro( + "attributes", + [](PyOperationBase &self) { + return PyOpAttributeMap(self.getOperation().getRef()); + }, + "Returns a dictionary-like map of operation attributes.") .def_prop_ro( "context", [](PyOperationBase &self) -> nb::typed { @@ -3505,22 +3596,28 @@ void mlir::python::populateIRCore(nb::module_ &m) { concreteOperation.checkValid(); return concreteOperation.getContext().getObject(); }, - "Context that owns the Operation") - .def_prop_ro("name", - [](PyOperationBase &self) { - auto &concreteOperation = self.getOperation(); - concreteOperation.checkValid(); - MlirOperation operation = concreteOperation.get(); - return mlirIdentifierStr(mlirOperationGetName(operation)); - }) - .def_prop_ro("operands", - [](PyOperationBase &self) { - return PyOpOperandList(self.getOperation().getRef()); - }) - .def_prop_ro("regions", - [](PyOperationBase &self) { - return PyRegionList(self.getOperation().getRef()); - }) + "Context that owns the operation.") + .def_prop_ro( + "name", + [](PyOperationBase &self) { + auto &concreteOperation = self.getOperation(); + concreteOperation.checkValid(); + MlirOperation operation = concreteOperation.get(); + return mlirIdentifierStr(mlirOperationGetName(operation)); + }, + "Returns the fully qualified name of the operation.") + .def_prop_ro( + "operands", + [](PyOperationBase &self) { + return PyOpOperandList(self.getOperation().getRef()); + }, + "Returns the list of operation operands.") + .def_prop_ro( + "regions", + [](PyOperationBase &self) { + return PyRegionList(self.getOperation().getRef()); + }, + "Returns the list of operation regions.") .def_prop_ro( "results", [](PyOperationBase &self) { @@ -3551,14 +3648,16 @@ void mlir::python::populateIRCore(nb::module_ &m) { "defined or derived from."), nb::for_setter("Sets the source location the operation was defined " "or derived from.")) - .def_prop_ro("parent", - [](PyOperationBase &self) - -> std::optional> { - auto parent = self.getOperation().getParentOperation(); - if (parent) - return parent->getObject(); - return {}; - }) + .def_prop_ro( + "parent", + [](PyOperationBase &self) + -> std::optional> { + auto parent = self.getOperation().getParentOperation(); + if (parent) + return parent->getObject(); + return {}; + }, + "Returns the parent operation, or `None` if at top level.") .def( "__str__", [](PyOperationBase &self) { @@ -3579,7 +3678,14 @@ void mlir::python::populateIRCore(nb::module_ &m) { nb::overload_cast( &PyOperationBase::print), nb::arg("state"), nb::arg("file") = nb::none(), - nb::arg("binary") = false, kOperationPrintStateDocstring) + nb::arg("binary") = false, + R"( + Prints the assembly form of the operation to a file like object. + + Args: + state: `AsmState` capturing the operation numbering and flags. + file: Optional file like object to write to. Defaults to sys.stdout. + binary: Whether to write `bytes` (True) or `str` (False). Defaults to False.)") .def("print", nb::overload_cast, std::optional, bool, bool, bool, bool, bool, bool, nb::object, @@ -3594,10 +3700,47 @@ void mlir::python::populateIRCore(nb::module_ &m) { nb::arg("use_name_loc_as_prefix") = false, nb::arg("assume_verified") = false, nb::arg("file") = nb::none(), nb::arg("binary") = false, nb::arg("skip_regions") = false, - kOperationPrintDocstring) + R"( + Prints the assembly form of the operation to a file like object. + + Args: + large_elements_limit: Whether to elide elements attributes above this + number of elements. Defaults to None (no limit). + large_resource_limit: Whether to elide resource attributes above this + number of characters. Defaults to None (no limit). If large_elements_limit + is set and this is None, the behavior will be to use large_elements_limit + as large_resource_limit. + enable_debug_info: Whether to print debug/location information. Defaults + to False. + pretty_debug_info: Whether to format debug information for easier reading + by a human (warning: the result is unparseable). Defaults to False. + print_generic_op_form: Whether to print the generic assembly forms of all + ops. Defaults to False. + use_local_scope: Whether to print in a way that is more optimized for + multi-threaded access but may not be consistent with how the overall + module prints. + use_name_loc_as_prefix: Whether to use location attributes (NameLoc) as + prefixes for the SSA identifiers. Defaults to False. + assume_verified: By default, if not printing generic form, the verifier + will be run and if it fails, generic form will be printed with a comment + about failed verification. While a reasonable default for interactive use, + for systematic use, it is often better for the caller to verify explicitly + and report failures in a more robust fashion. Set this to True if doing this + in order to avoid running a redundant verification. If the IR is actually + invalid, behavior is undefined. + file: The file like object to write to. Defaults to sys.stdout. + binary: Whether to write bytes (True) or str (False). Defaults to False. + skip_regions: Whether to skip printing regions. Defaults to False.)") .def("write_bytecode", &PyOperationBase::writeBytecode, nb::arg("file"), nb::arg("desired_version") = nb::none(), - kOperationPrintBytecodeDocstring) + R"( + Write the bytecode form of the operation to a file like object. + + Args: + file: The file like object to write to. + desired_version: Optional version of bytecode to emit. + Returns: + The bytecode writer status.)") .def("get_asm", &PyOperationBase::getAsm, // Careful: Lots of arguments must match up with get_asm method. nb::arg("binary") = false, @@ -3609,7 +3752,17 @@ void mlir::python::populateIRCore(nb::module_ &m) { nb::arg("use_local_scope") = false, nb::arg("use_name_loc_as_prefix") = false, nb::arg("assume_verified") = false, nb::arg("skip_regions") = false, - kOperationGetAsmDocstring) + R"( + Gets the assembly form of the operation with all options available. + + Args: + binary: Whether to return a bytes (True) or str (False) object. Defaults to + False. + ... others ...: See the print() method for common keyword arguments for + configuring the printout. + Returns: + Either a bytes or str object, depending on the setting of the `binary` + argument.)") .def("verify", &PyOperationBase::verify, "Verify the operation. Raises MLIRError if verification fails, and " "returns true otherwise.") @@ -3621,18 +3774,31 @@ void mlir::python::populateIRCore(nb::module_ &m) { "block.") .def("is_before_in_block", &PyOperationBase::isBeforeInBlock, nb::arg("other"), - "Given an operation 'other' that is within the same parent block, " - "return" - "whether the current operation is before 'other' in the operation " - "list" - "of the parent block.") + R"( + Checks if this operation is before another in the same block. + + Args: + other: Another operation in the same parent block. + + Returns: + True if this operation is before `other` in the operation list of the parent block.)") .def( "clone", [](PyOperationBase &self, const nb::object &ip) -> nb::typed { return self.getOperation().clone(ip); }, - nb::arg("ip") = nb::none()) + nb::arg("ip") = nb::none(), + R"( + Creates a deep copy of the operation. + + Args: + ip: Optional insertion point where the cloned operation should be inserted. + If None, the current insertion point is used. If False, the operation + remains detached. + + Returns: + A new Operation that is a clone of this operation.)") .def( "detach_from_parent", [](PyOperationBase &self) -> nb::typed { @@ -3653,13 +3819,24 @@ void mlir::python::populateIRCore(nb::module_ &m) { return operation.isAttached(); }, "Reports if the operation is attached to its parent block.") - .def("erase", [](PyOperationBase &self) { self.getOperation().erase(); }) + .def( + "erase", [](PyOperationBase &self) { self.getOperation().erase(); }, + R"( + Erases the operation and frees its memory. + + Note: + After erasing, any Python references to the operation become invalid.)") .def("walk", &PyOperationBase::walk, nb::arg("callback"), nb::arg("walk_order") = MlirWalkPostOrder, // clang-format off - nb::sig("def walk(self, callback: Callable[[Operation], WalkResult], walk_order: WalkOrder) -> None") + nb::sig("def walk(self, callback: Callable[[Operation], WalkResult], walk_order: WalkOrder) -> None"), // clang-format on - ); + R"( + Walks the operation tree with a callback function. + + Args: + callback: A callable that takes an Operation and returns a WalkResult. + walk_order: The order of traversal (PRE_ORDER or POST_ORDER).)"); nb::class_(m, "Operation") .def_static( @@ -3692,7 +3869,22 @@ void mlir::python::populateIRCore(nb::module_ &m) { nb::arg("operands") = nb::none(), nb::arg("attributes") = nb::none(), nb::arg("successors") = nb::none(), nb::arg("regions") = 0, nb::arg("loc") = nb::none(), nb::arg("ip") = nb::none(), - nb::arg("infer_type") = false, kOperationCreateDocstring) + nb::arg("infer_type") = false, + R"( + Creates a new operation. + + Args: + name: Operation name (e.g. `dialect.operation`). + results: Optional sequence of Type representing op result types. + operands: Optional operands of the operation. + attributes: Optional Dict of {str: Attribute}. + successors: Optional List of Block for the operation's successors. + regions: Number of regions to create (default = 0). + location: Optional Location object (defaults to resolve from context manager). + ip: Optional InsertionPoint (defaults to resolve from context manager or set to False to disable insertion, even with an insertion point set in the context manager). + infer_type: Whether to infer result types (default = False). + Returns: + A new detached Operation object. Detached operations can be added to blocks, which causes them to become attached.)") .def_static( "parse", [](const std::string &sourceStr, const std::string &sourceName, @@ -3705,18 +3897,30 @@ void mlir::python::populateIRCore(nb::module_ &m) { nb::arg("context") = nb::none(), "Parses an operation. Supports both text assembly format and binary " "bytecode format.") - .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyOperation::getCapsule) + .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyOperation::getCapsule, + "Gets a capsule wrapping the MlirOperation.") .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, - &PyOperation::createFromCapsule) - .def_prop_ro("operation", - [](nb::object self) -> nb::typed { - return self; - }) - .def_prop_ro("opview", - [](PyOperation &self) -> nb::typed { - return self.createOpView(); - }) - .def_prop_ro("block", &PyOperation::getBlock) + &PyOperation::createFromCapsule, + "Creates an Operation from a capsule wrapping MlirOperation.") + .def_prop_ro( + "operation", + [](nb::object self) -> nb::typed { + return self; + }, + "Returns self (the operation).") + .def_prop_ro( + "opview", + [](PyOperation &self) -> nb::typed { + return self.createOpView(); + }, + R"( + Returns an OpView of this operation. + + Note: + If the operation has a registered and loaded dialect then this OpView will + be concrete wrapper class.)") + .def_prop_ro("block", &PyOperation::getBlock, + "Returns the block containing this operation.") .def_prop_ro( "successors", [](PyOperationBase &self) { @@ -3830,7 +4034,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { }, nb::arg("cls"), nb::arg("source"), nb::kw_only(), nb::arg("source_name") = "", nb::arg("context") = nb::none(), - "Parses a specific, generated OpView based on class level attributes"); + "Parses a specific, generated OpView based on class level attributes."); //---------------------------------------------------------------------------- // Mapping of PyRegion. @@ -3856,17 +4060,22 @@ void mlir::python::populateIRCore(nb::module_ &m) { return PyBlockIterator(self.getParentOperation(), firstBlock); }, "Iterates over blocks in the region.") - .def("__eq__", - [](PyRegion &self, PyRegion &other) { - return self.get().ptr == other.get().ptr; - }) - .def("__eq__", [](PyRegion &self, nb::object &other) { return false; }); + .def( + "__eq__", + [](PyRegion &self, PyRegion &other) { + return self.get().ptr == other.get().ptr; + }, + "Compares two regions for pointer equality.") + .def( + "__eq__", [](PyRegion &self, nb::object &other) { return false; }, + "Compares region with non-region object (always returns False)."); //---------------------------------------------------------------------------- // Mapping of PyBlock. //---------------------------------------------------------------------------- nb::class_(m, "Block") - .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyBlock::getCapsule) + .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyBlock::getCapsule, + "Gets a capsule wrapping the MlirBlock.") .def_prop_ro( "owner", [](PyBlock &self) -> nb::typed { @@ -3893,14 +4102,26 @@ void mlir::python::populateIRCore(nb::module_ &m) { mlirBlockAddArgument(self.get(), type, loc)); }, "type"_a, "loc"_a, - "Append an argument of the specified type to the block and returns " - "the newly added argument.") + R"( + Appends an argument of the specified type to the block. + + Args: + type: The type of the argument to add. + loc: The source location for the argument. + + Returns: + The newly added block argument.)") .def( "erase_argument", [](PyBlock &self, unsigned index) { return mlirBlockEraseArgument(self.get(), index); }, - "Erase the argument at 'index' and remove it from the argument list.") + nb::arg("index"), + R"( + Erases the argument at the specified index. + + Args: + index: The index of the argument to erase.)") .def_prop_ro( "operations", [](PyBlock &self) { @@ -3928,7 +4149,14 @@ void mlir::python::populateIRCore(nb::module_ &m) { mlirBlockDetach(b); mlirRegionAppendOwnedBlock(region.get(), b); }, - "Append this block to a region, transferring ownership if necessary") + nb::arg("region"), + R"( + Appends this block to a region. + + Transfers ownership if the block is currently owned by another region. + + Args: + region: The region to append the block to.)") .def( "create_before", [](PyBlock &self, const nb::args &pyArgTypes, @@ -3969,15 +4197,21 @@ void mlir::python::populateIRCore(nb::module_ &m) { firstOperation); }, "Iterates over operations in the block.") - .def("__eq__", - [](PyBlock &self, PyBlock &other) { - return self.get().ptr == other.get().ptr; - }) - .def("__eq__", [](PyBlock &self, nb::object &other) { return false; }) - .def("__hash__", - [](PyBlock &self) { - return static_cast(llvm::hash_value(self.get().ptr)); - }) + .def( + "__eq__", + [](PyBlock &self, PyBlock &other) { + return self.get().ptr == other.get().ptr; + }, + "Compares two blocks for pointer equality.") + .def( + "__eq__", [](PyBlock &self, nb::object &other) { return false; }, + "Compares block with non-block object (always returns False).") + .def( + "__hash__", + [](PyBlock &self) { + return static_cast(llvm::hash_value(self.get().ptr)); + }, + "Returns the hash value of the block.") .def( "__str__", [](PyBlock &self) { @@ -4000,8 +4234,13 @@ void mlir::python::populateIRCore(nb::module_ &m) { self.getParentOperation().getObject()); }, nb::arg("operation"), - "Appends an operation to this block. If the operation is currently " - "in another block, it will be moved.") + R"( + Appends an operation to this block. + + If the operation is currently in another block, it will be moved. + + Args: + operation: The operation to append to the block.)") .def_prop_ro( "successors", [](PyBlock &self) { @@ -4022,10 +4261,12 @@ void mlir::python::populateIRCore(nb::module_ &m) { nb::class_(m, "InsertionPoint") .def(nb::init(), nb::arg("block"), "Inserts after the last operation but still inside the block.") - .def("__enter__", &PyInsertionPoint::contextEnter) + .def("__enter__", &PyInsertionPoint::contextEnter, + "Enters the insertion point as a context manager.") .def("__exit__", &PyInsertionPoint::contextExit, nb::arg("exc_type").none(), nb::arg("exc_value").none(), - nb::arg("traceback").none()) + nb::arg("traceback").none(), + "Exits the insertion point context manager.") .def_prop_ro_static( "current", [](nb::object & /*class*/) { @@ -4036,20 +4277,50 @@ void mlir::python::populateIRCore(nb::module_ &m) { }, nb::sig("def current(/) -> InsertionPoint"), "Gets the InsertionPoint bound to the current thread or raises " - "ValueError if none has been set") + "ValueError if none has been set.") .def(nb::init(), nb::arg("beforeOperation"), "Inserts before a referenced operation.") .def_static("at_block_begin", &PyInsertionPoint::atBlockBegin, - nb::arg("block"), "Inserts at the beginning of the block.") + nb::arg("block"), + R"( + Creates an insertion point at the beginning of a block. + + Args: + block: The block at whose beginning operations should be inserted. + + Returns: + An InsertionPoint at the block's beginning.)") .def_static("at_block_terminator", &PyInsertionPoint::atBlockTerminator, - nb::arg("block"), "Inserts before the block terminator.") + nb::arg("block"), + R"( + Creates an insertion point before a block's terminator. + + Args: + block: The block whose terminator to insert before. + + Returns: + An InsertionPoint before the terminator. + + Raises: + ValueError: If the block has no terminator.)") .def_static("after", &PyInsertionPoint::after, nb::arg("operation"), - "Inserts after the operation.") + R"( + Creates an insertion point immediately after an operation. + + Args: + operation: The operation after which to insert. + + Returns: + An InsertionPoint after the operation.)") .def("insert", &PyInsertionPoint::insert, nb::arg("operation"), - "Inserts an operation.") + R"( + Inserts an operation at this insertion point. + + Args: + operation: The operation to insert.)") .def_prop_ro( "block", [](PyInsertionPoint &self) { return self.getBlock(); }, - "Returns the block that this InsertionPoint points to.") + "Returns the block that this `InsertionPoint` points to.") .def_prop_ro( "ref_operation", [](PyInsertionPoint &self) @@ -4061,7 +4332,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { }, "The reference operation before which new operations are " "inserted, or None if the insertion point is at the end of " - "the block"); + "the block."); //---------------------------------------------------------------------------- // Mapping of PyAttribute. @@ -4070,10 +4341,12 @@ void mlir::python::populateIRCore(nb::module_ &m) { // Delegate to the PyAttribute copy constructor, which will also lifetime // extend the backing context which owns the MlirAttribute. .def(nb::init(), nb::arg("cast_from_type"), - "Casts the passed attribute to the generic Attribute") - .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyAttribute::getCapsule) - .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, - &PyAttribute::createFromCapsule) + "Casts the passed attribute to the generic `Attribute`.") + .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyAttribute::getCapsule, + "Gets a capsule wrapping the MlirAttribute.") + .def_static( + MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyAttribute::createFromCapsule, + "Creates an Attribute from a capsule wrapping `MlirAttribute`.") .def_static( "parse", [](const std::string &attrSpec, DefaultingPyMlirContext context) @@ -4086,33 +4359,49 @@ void mlir::python::populateIRCore(nb::module_ &m) { return PyAttribute(context.get()->getRef(), attr).maybeDownCast(); }, nb::arg("asm"), nb::arg("context") = nb::none(), - "Parses an attribute from an assembly form. Raises an MLIRError on " + "Parses an attribute from an assembly form. Raises an `MLIRError` on " "failure.") .def_prop_ro( "context", [](PyAttribute &self) -> nb::typed { return self.getContext().getObject(); }, - "Context that owns the Attribute") - .def_prop_ro("type", - [](PyAttribute &self) -> nb::typed { - return PyType(self.getContext(), - mlirAttributeGetType(self)) - .maybeDownCast(); - }) + "Context that owns the `Attribute`.") + .def_prop_ro( + "type", + [](PyAttribute &self) -> nb::typed { + return PyType(self.getContext(), mlirAttributeGetType(self)) + .maybeDownCast(); + }, + "Returns the type of the `Attribute`.") .def( "get_named", [](PyAttribute &self, std::string name) { return PyNamedAttribute(self, std::move(name)); }, - nb::keep_alive<0, 1>(), "Binds a name to the attribute") - .def("__eq__", - [](PyAttribute &self, PyAttribute &other) { return self == other; }) - .def("__eq__", [](PyAttribute &self, nb::object &other) { return false; }) - .def("__hash__", - [](PyAttribute &self) { - return static_cast(llvm::hash_value(self.get().ptr)); - }) + nb::keep_alive<0, 1>(), + R"( + Binds a name to the attribute, creating a `NamedAttribute`. + + Args: + name: The name to bind to the `Attribute`. + + Returns: + A `NamedAttribute` with the given name and this attribute.)") + .def( + "__eq__", + [](PyAttribute &self, PyAttribute &other) { return self == other; }, + "Compares two attributes for equality.") + .def( + "__eq__", [](PyAttribute &self, nb::object &other) { return false; }, + "Compares attribute with non-attribute object (always returns " + "False).") + .def( + "__hash__", + [](PyAttribute &self) { + return static_cast(llvm::hash_value(self.get().ptr)); + }, + "Returns the hash value of the attribute.") .def( "dump", [](PyAttribute &self) { mlirAttributeDump(self); }, kDumpDocstring) @@ -4125,61 +4414,69 @@ void mlir::python::populateIRCore(nb::module_ &m) { return printAccum.join(); }, "Returns the assembly form of the Attribute.") - .def("__repr__", - [](PyAttribute &self) { - // Generally, assembly formats are not printed for __repr__ because - // this can cause exceptionally long debug output and exceptions. - // However, attribute values are generally considered useful and - // are printed. This may need to be re-evaluated if debug dumps end - // up being excessive. - PyPrintAccumulator printAccum; - printAccum.parts.append("Attribute("); - mlirAttributePrint(self, printAccum.getCallback(), - printAccum.getUserData()); - printAccum.parts.append(")"); - return printAccum.join(); - }) - .def_prop_ro("typeid", - [](PyAttribute &self) { - MlirTypeID mlirTypeID = mlirAttributeGetTypeID(self); - assert(!mlirTypeIDIsNull(mlirTypeID) && - "mlirTypeID was expected to be non-null."); - return PyTypeID(mlirTypeID); - }) - .def(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR, - [](PyAttribute &self) -> nb::typed { - return self.maybeDownCast(); - }); + .def( + "__repr__", + [](PyAttribute &self) { + // Generally, assembly formats are not printed for __repr__ because + // this can cause exceptionally long debug output and exceptions. + // However, attribute values are generally considered useful and + // are printed. This may need to be re-evaluated if debug dumps end + // up being excessive. + PyPrintAccumulator printAccum; + printAccum.parts.append("Attribute("); + mlirAttributePrint(self, printAccum.getCallback(), + printAccum.getUserData()); + printAccum.parts.append(")"); + return printAccum.join(); + }, + "Returns a string representation of the attribute.") + .def_prop_ro( + "typeid", + [](PyAttribute &self) { + MlirTypeID mlirTypeID = mlirAttributeGetTypeID(self); + assert(!mlirTypeIDIsNull(mlirTypeID) && + "mlirTypeID was expected to be non-null."); + return PyTypeID(mlirTypeID); + }, + "Returns the `TypeID` of the attribute.") + .def( + MLIR_PYTHON_MAYBE_DOWNCAST_ATTR, + [](PyAttribute &self) -> nb::typed { + return self.maybeDownCast(); + }, + "Downcasts the attribute to a more specific attribute if possible."); //---------------------------------------------------------------------------- // Mapping of PyNamedAttribute //---------------------------------------------------------------------------- nb::class_(m, "NamedAttribute") - .def("__repr__", - [](PyNamedAttribute &self) { - PyPrintAccumulator printAccum; - printAccum.parts.append("NamedAttribute("); - printAccum.parts.append( - nb::str(mlirIdentifierStr(self.namedAttr.name).data, - mlirIdentifierStr(self.namedAttr.name).length)); - printAccum.parts.append("="); - mlirAttributePrint(self.namedAttr.attribute, - printAccum.getCallback(), - printAccum.getUserData()); - printAccum.parts.append(")"); - return printAccum.join(); - }) + .def( + "__repr__", + [](PyNamedAttribute &self) { + PyPrintAccumulator printAccum; + printAccum.parts.append("NamedAttribute("); + printAccum.parts.append( + nb::str(mlirIdentifierStr(self.namedAttr.name).data, + mlirIdentifierStr(self.namedAttr.name).length)); + printAccum.parts.append("="); + mlirAttributePrint(self.namedAttr.attribute, + printAccum.getCallback(), + printAccum.getUserData()); + printAccum.parts.append(")"); + return printAccum.join(); + }, + "Returns a string representation of the named attribute.") .def_prop_ro( "name", [](PyNamedAttribute &self) { return mlirIdentifierStr(self.namedAttr.name); }, - "The name of the NamedAttribute binding") + "The name of the `NamedAttribute` binding.") .def_prop_ro( "attr", [](PyNamedAttribute &self) { return self.namedAttr.attribute; }, nb::keep_alive<0, 1>(), nb::sig("def attr(self) -> Attribute"), - "The underlying generic attribute of the NamedAttribute binding"); + "The underlying generic attribute of the `NamedAttribute` binding."); //---------------------------------------------------------------------------- // Mapping of PyType. @@ -4188,9 +4485,11 @@ void mlir::python::populateIRCore(nb::module_ &m) { // Delegate to the PyType copy constructor, which will also lifetime // extend the backing context which owns the MlirType. .def(nb::init(), nb::arg("cast_from_type"), - "Casts the passed type to the generic Type") - .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyType::getCapsule) - .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyType::createFromCapsule) + "Casts the passed type to the generic `Type`.") + .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyType::getCapsule, + "Gets a capsule wrapping the `MlirType`.") + .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyType::createFromCapsule, + "Creates a Type from a capsule wrapping `MlirType`.") .def_static( "parse", [](std::string typeSpec, @@ -4203,21 +4502,31 @@ void mlir::python::populateIRCore(nb::module_ &m) { return PyType(context.get()->getRef(), type).maybeDownCast(); }, nb::arg("asm"), nb::arg("context") = nb::none(), - kContextParseTypeDocstring) + R"( + Parses the assembly form of a type. + + Returns a Type object or raises an `MLIRError` if the type cannot be parsed. + + See also: https://mlir.llvm.org/docs/LangRef/#type-system)") .def_prop_ro( "context", [](PyType &self) -> nb::typed { return self.getContext().getObject(); }, - "Context that owns the Type") - .def("__eq__", [](PyType &self, PyType &other) { return self == other; }) + "Context that owns the `Type`.") + .def( + "__eq__", [](PyType &self, PyType &other) { return self == other; }, + "Compares two types for equality.") .def( "__eq__", [](PyType &self, nb::object &other) { return false; }, - nb::arg("other").none()) - .def("__hash__", - [](PyType &self) { - return static_cast(llvm::hash_value(self.get().ptr)); - }) + nb::arg("other").none(), + "Compares type with non-type object (always returns False).") + .def( + "__hash__", + [](PyType &self) { + return static_cast(llvm::hash_value(self.get().ptr)); + }, + "Returns the hash value of the `Type`.") .def( "dump", [](PyType &self) { mlirTypeDump(self); }, kDumpDocstring) .def( @@ -4228,60 +4537,81 @@ void mlir::python::populateIRCore(nb::module_ &m) { printAccum.getUserData()); return printAccum.join(); }, - "Returns the assembly form of the type.") - .def("__repr__", - [](PyType &self) { - // Generally, assembly formats are not printed for __repr__ because - // this can cause exceptionally long debug output and exceptions. - // However, types are an exception as they typically have compact - // assembly forms and printing them is useful. - PyPrintAccumulator printAccum; - printAccum.parts.append("Type("); - mlirTypePrint(self, printAccum.getCallback(), - printAccum.getUserData()); - printAccum.parts.append(")"); - return printAccum.join(); - }) - .def(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR, - [](PyType &self) -> nb::typed { - return self.maybeDownCast(); - }) - .def_prop_ro("typeid", [](PyType &self) { - MlirTypeID mlirTypeID = mlirTypeGetTypeID(self); - if (!mlirTypeIDIsNull(mlirTypeID)) - return PyTypeID(mlirTypeID); - auto origRepr = nb::cast(nb::repr(nb::cast(self))); - throw nb::value_error( - (origRepr + llvm::Twine(" has no typeid.")).str().c_str()); - }); + "Returns the assembly form of the `Type`.") + .def( + "__repr__", + [](PyType &self) { + // Generally, assembly formats are not printed for __repr__ because + // this can cause exceptionally long debug output and exceptions. + // However, types are an exception as they typically have compact + // assembly forms and printing them is useful. + PyPrintAccumulator printAccum; + printAccum.parts.append("Type("); + mlirTypePrint(self, printAccum.getCallback(), + printAccum.getUserData()); + printAccum.parts.append(")"); + return printAccum.join(); + }, + "Returns a string representation of the `Type`.") + .def( + MLIR_PYTHON_MAYBE_DOWNCAST_ATTR, + [](PyType &self) -> nb::typed { + return self.maybeDownCast(); + }, + "Downcasts the Type to a more specific `Type` if possible.") + .def_prop_ro( + "typeid", + [](PyType &self) { + MlirTypeID mlirTypeID = mlirTypeGetTypeID(self); + if (!mlirTypeIDIsNull(mlirTypeID)) + return PyTypeID(mlirTypeID); + auto origRepr = nb::cast(nb::repr(nb::cast(self))); + throw nb::value_error( + (origRepr + llvm::Twine(" has no typeid.")).str().c_str()); + }, + "Returns the `TypeID` of the `Type`, or raises `ValueError` if " + "`Type` has no " + "`TypeID`."); //---------------------------------------------------------------------------- // Mapping of PyTypeID. //---------------------------------------------------------------------------- nb::class_(m, "TypeID") - .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyTypeID::getCapsule) - .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyTypeID::createFromCapsule) + .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyTypeID::getCapsule, + "Gets a capsule wrapping the `MlirTypeID`.") + .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyTypeID::createFromCapsule, + "Creates a `TypeID` from a capsule wrapping `MlirTypeID`.") // Note, this tests whether the underlying TypeIDs are the same, // not whether the wrapper MlirTypeIDs are the same, nor whether // the Python objects are the same (i.e., PyTypeID is a value type). - .def("__eq__", - [](PyTypeID &self, PyTypeID &other) { return self == other; }) - .def("__eq__", - [](PyTypeID &self, const nb::object &other) { return false; }) + .def( + "__eq__", + [](PyTypeID &self, PyTypeID &other) { return self == other; }, + "Compares two `TypeID`s for equality.") + .def( + "__eq__", + [](PyTypeID &self, const nb::object &other) { return false; }, + "Compares TypeID with non-TypeID object (always returns False).") // Note, this gives the hash value of the underlying TypeID, not the // hash value of the Python object, nor the hash value of the // MlirTypeID wrapper. - .def("__hash__", [](PyTypeID &self) { - return static_cast(mlirTypeIDHashValue(self)); - }); + .def( + "__hash__", + [](PyTypeID &self) { + return static_cast(mlirTypeIDHashValue(self)); + }, + "Returns the hash value of the `TypeID`."); //---------------------------------------------------------------------------- // Mapping of Value. //---------------------------------------------------------------------------- nb::class_(m, "Value") - .def(nb::init(), nb::keep_alive<0, 1>(), nb::arg("value")) - .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyValue::getCapsule) - .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyValue::createFromCapsule) + .def(nb::init(), nb::keep_alive<0, 1>(), nb::arg("value"), + "Creates a Value reference from another `Value`.") + .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyValue::getCapsule, + "Gets a capsule wrapping the `MlirValue`.") + .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyValue::createFromCapsule, + "Creates a `Value` from a capsule wrapping `MlirValue`.") .def_prop_ro( "context", [](PyValue &self) -> nb::typed { @@ -4312,23 +4642,30 @@ void mlir::python::populateIRCore(nb::module_ &m) { assert(false && "Value must be a block argument or an op result"); return nb::none(); }, - // clang-format off - nb::sig("def owner(self) -> Operation | Block | None")) - // clang-format on - .def_prop_ro("uses", - [](PyValue &self) { - return PyOpOperandIterator( - mlirValueGetFirstUse(self.get())); - }) - .def("__eq__", - [](PyValue &self, PyValue &other) { - return self.get().ptr == other.get().ptr; - }) - .def("__eq__", [](PyValue &self, nb::object other) { return false; }) - .def("__hash__", - [](PyValue &self) { - return static_cast(llvm::hash_value(self.get().ptr)); - }) + "Returns the owner of the value (`Operation` for results, `Block` " + "for " + "arguments).") + .def_prop_ro( + "uses", + [](PyValue &self) { + return PyOpOperandIterator(mlirValueGetFirstUse(self.get())); + }, + "Returns an iterator over uses of this value.") + .def( + "__eq__", + [](PyValue &self, PyValue &other) { + return self.get().ptr == other.get().ptr; + }, + "Compares two values for pointer equality.") + .def( + "__eq__", [](PyValue &self, nb::object other) { return false; }, + "Compares value with non-value object (always returns False).") + .def( + "__hash__", + [](PyValue &self) { + return static_cast(llvm::hash_value(self.get().ptr)); + }, + "Returns the hash value of the value.") .def( "__str__", [](PyValue &self) { @@ -4339,7 +4676,13 @@ void mlir::python::populateIRCore(nb::module_ &m) { printAccum.parts.append(")"); return printAccum.join(); }, - kValueDunderStrDocstring) + R"( + Returns the string form of the value. + + If the value is a block argument, this is the assembly form of its type and the + position in the argument list. If the value is an operation result, this is + equivalent to printing the operation that produced it. + )") .def( "get_name", [](PyValue &self, bool useLocalScope, bool useNameLocAsPrefix) { @@ -4359,7 +4702,16 @@ void mlir::python::populateIRCore(nb::module_ &m) { return printAccum.join(); }, nb::arg("use_local_scope") = false, - nb::arg("use_name_loc_as_prefix") = false) + nb::arg("use_name_loc_as_prefix") = false, + R"( + Returns the string form of value as an operand. + + Args: + use_local_scope: Whether to use local scope for naming. + use_name_loc_as_prefix: Whether to use the location attribute (NameLoc) as prefix. + + Returns: + The value's name as it appears in IR (e.g., `%0`, `%arg0`).)") .def( "get_name", [](PyValue &self, PyAsmState &state) { @@ -4370,25 +4722,29 @@ void mlir::python::populateIRCore(nb::module_ &m) { printAccum.getUserData()); return printAccum.join(); }, - nb::arg("state"), kGetNameAsOperand) - .def_prop_ro("type", - [](PyValue &self) -> nb::typed { - return PyType(self.getParentOperation()->getContext(), - mlirValueGetType(self.get())) - .maybeDownCast(); - }) + nb::arg("state"), + "Returns the string form of value as an operand (i.e., the ValueID).") + .def_prop_ro( + "type", + [](PyValue &self) -> nb::typed { + return PyType(self.getParentOperation()->getContext(), + mlirValueGetType(self.get())) + .maybeDownCast(); + }, + "Returns the type of the value.") .def( "set_type", [](PyValue &self, const PyType &type) { - return mlirValueSetType(self.get(), type); + mlirValueSetType(self.get(), type); }, - nb::arg("type")) + nb::arg("type"), "Sets the type of the value.") .def( "replace_all_uses_with", [](PyValue &self, PyValue &with) { mlirValueReplaceAllUsesOfWith(self.get(), with.get()); }, - kValueReplaceAllUsesWithDocstring) + "Replace all uses of value with the new value, updating anything in " + "the IR that uses `self` to use the other value instead.") .def( "replace_all_uses_except", [](PyValue &self, PyValue &with, PyOperation &exception) { @@ -4434,10 +4790,12 @@ void mlir::python::populateIRCore(nb::module_ &m) { }, nb::arg("with_"), nb::arg("exceptions"), kValueReplaceAllUsesExceptDocstring) - .def(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR, - [](PyValue &self) -> nb::typed { - return self.maybeDownCast(); - }) + .def( + MLIR_PYTHON_MAYBE_DOWNCAST_ATTR, + [](PyValue &self) -> nb::typed { + return self.maybeDownCast(); + }, + "Downcasts the `Value` to a more specific kind if possible.") .def_prop_ro( "location", [](MlirValue self) { @@ -4445,7 +4803,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { PyMlirContext::forContext(mlirValueGetContext(self)), mlirValueGetLocation(self)); }, - "Returns the source location the value"); + "Returns the source location of the value."); PyBlockArgument::bind(m); PyOpResult::bind(m); @@ -4453,43 +4811,105 @@ void mlir::python::populateIRCore(nb::module_ &m) { nb::class_(m, "AsmState") .def(nb::init(), nb::arg("value"), - nb::arg("use_local_scope") = false) + nb::arg("use_local_scope") = false, + R"( + Creates an `AsmState` for consistent SSA value naming. + + Args: + value: The value to create state for. + use_local_scope: Whether to use local scope for naming.)") .def(nb::init(), nb::arg("op"), - nb::arg("use_local_scope") = false); + nb::arg("use_local_scope") = false, + R"( + Creates an AsmState for consistent SSA value naming. + + Args: + op: The operation to create state for. + use_local_scope: Whether to use local scope for naming.)"); //---------------------------------------------------------------------------- // Mapping of SymbolTable. //---------------------------------------------------------------------------- nb::class_(m, "SymbolTable") - .def(nb::init()) - .def("__getitem__", - [](PySymbolTable &self, - const std::string &name) -> nb::typed { - return self.dunderGetItem(name); - }) - .def("insert", &PySymbolTable::insert, nb::arg("operation")) - .def("erase", &PySymbolTable::erase, nb::arg("operation")) - .def("__delitem__", &PySymbolTable::dunderDel) - .def("__contains__", - [](PySymbolTable &table, const std::string &name) { - return !mlirOperationIsNull(mlirSymbolTableLookup( - table, mlirStringRefCreate(name.data(), name.length()))); - }) + .def(nb::init(), + R"( + Creates a symbol table for an operation. + + Args: + operation: The `Operation` that defines a symbol table (e.g., a `ModuleOp`). + + Raises: + TypeError: If the operation is not a symbol table.)") + .def( + "__getitem__", + [](PySymbolTable &self, + const std::string &name) -> nb::typed { + return self.dunderGetItem(name); + }, + R"( + Looks up a symbol by name in the symbol table. + + Args: + name: The name of the symbol to look up. + + Returns: + The operation defining the symbol. + + Raises: + KeyError: If the symbol is not found.)") + .def("insert", &PySymbolTable::insert, nb::arg("operation"), + R"( + Inserts a symbol operation into the symbol table. + + Args: + operation: An operation with a symbol name to insert. + + Returns: + The symbol name attribute of the inserted operation. + + Raises: + ValueError: If the operation does not have a symbol name.)") + .def("erase", &PySymbolTable::erase, nb::arg("operation"), + R"( + Erases a symbol operation from the symbol table. + + Args: + operation: The symbol operation to erase. + + Note: + The operation is also erased from the IR and invalidated.)") + .def("__delitem__", &PySymbolTable::dunderDel, + "Deletes a symbol by name from the symbol table.") + .def( + "__contains__", + [](PySymbolTable &table, const std::string &name) { + return !mlirOperationIsNull(mlirSymbolTableLookup( + table, mlirStringRefCreate(name.data(), name.length()))); + }, + "Checks if a symbol with the given name exists in the table.") // Static helpers. .def_static("set_symbol_name", &PySymbolTable::setSymbolName, - nb::arg("symbol"), nb::arg("name")) + nb::arg("symbol"), nb::arg("name"), + "Sets the symbol name for a symbol operation.") .def_static("get_symbol_name", &PySymbolTable::getSymbolName, - nb::arg("symbol")) + nb::arg("symbol"), + "Gets the symbol name from a symbol operation.") .def_static("get_visibility", &PySymbolTable::getVisibility, - nb::arg("symbol")) + nb::arg("symbol"), + "Gets the visibility attribute of a symbol operation.") .def_static("set_visibility", &PySymbolTable::setVisibility, - nb::arg("symbol"), nb::arg("visibility")) + nb::arg("symbol"), nb::arg("visibility"), + "Sets the visibility attribute of a symbol operation.") .def_static("replace_all_symbol_uses", &PySymbolTable::replaceAllSymbolUses, nb::arg("old_symbol"), - nb::arg("new_symbol"), nb::arg("from_op")) + nb::arg("new_symbol"), nb::arg("from_op"), + "Replaces all uses of a symbol with a new symbol name within " + "the given operation.") .def_static("walk_symbol_tables", &PySymbolTable::walkSymbolTables, nb::arg("from_op"), nb::arg("all_sym_uses_visible"), - nb::arg("callback")); + nb::arg("callback"), + "Walks symbol tables starting from an operation with a " + "callback function."); // Container bindings. PyBlockArgumentList::bind(m); diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp index 1a1485ba2e02c..b097d3a0c9686 100644 --- a/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp +++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToXeVMPipeline.cpp @@ -63,13 +63,20 @@ void buildGPUPassPipeline(OpPassManager &pm, if (options.xegpuOpLevel == "workgroup") { pm.addNestedPass(xegpu::createXeGPUWgToSgDistribute()); pm.addNestedPass(createCSEPass()); + xegpu::XeGPUPropagateLayoutOptions layoutOptions; + layoutOptions.layoutKind = "inst"; + pm.addNestedPass( + xegpu::createXeGPUPropagateLayout(layoutOptions)); pm.addNestedPass(xegpu::createXeGPUBlocking()); pm.addNestedPass(createCanonicalizerPass()); pm.addNestedPass(createCSEPass()); } if (options.xegpuOpLevel == "subgroup" || options.xegpuOpLevel == "workgroup") { - pm.addNestedPass(xegpu::createXeGPUPropagateLayout()); + xegpu::XeGPUPropagateLayoutOptions layoutOptions; + layoutOptions.layoutKind = "lane"; + pm.addNestedPass( + xegpu::createXeGPUPropagateLayout(layoutOptions)); pm.addNestedPass(xegpu::createXeGPUSubgroupDistribute()); pm.addNestedPass(createCanonicalizerPass()); pm.addNestedPass(createCSEPass()); diff --git a/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp b/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp index 8943ba09d9c34..5fdd8534e4e51 100644 --- a/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp +++ b/mlir/lib/Dialect/XeGPU/TransformOps/XeGPUTransformOps.cpp @@ -13,6 +13,9 @@ #include +#include "llvm/Support/DebugLog.h" +#define DEBUG_TYPE "xegpu-transforms" + using namespace mlir; using namespace mlir::transform; @@ -76,6 +79,45 @@ static DiagnosedSilenceableFailure convertMixedValuesToInt( return DiagnosedSilenceableFailure::success(); } +/// Find producer operation of type T for the given value. +/// It's assumed that producer ops are chained through their first operand. +/// Producer chain is traced trough loop block arguments (init values). +template +static std::optional findProducerOfType(Value val) { + Value currentValue = val; + if (!currentValue.getDefiningOp()) { + // Value may be a block argument initialized outside a loop. + if (val.getNumUses() == 0) { + LDBG() << "Failed to find producer op, value has no uses."; + return std::nullopt; + } + auto userOp = val.getUsers().begin(); + auto parentLoop = userOp->getParentOfType(); + if (!parentLoop) { + LDBG() << "Failed to find producer op, not in a loop."; + return std::nullopt; + } + int64_t iterArgIdx; + if (auto iterArg = llvm::dyn_cast(currentValue)) { + auto numInductionVars = parentLoop.getLoopInductionVars()->size(); + iterArgIdx = iterArg.getArgNumber() - numInductionVars; + currentValue = parentLoop.getInits()[iterArgIdx]; + } else { + LDBG() << "Failed to find producer op, value not in init values."; + return std::nullopt; + } + } + Operation *producerOp = currentValue.getDefiningOp(); + + if (auto matchingOp = dyn_cast(producerOp)) + return matchingOp; + + if (producerOp->getNumOperands() == 0) + return std::nullopt; + + return findProducerOfType(producerOp->getOperand(0)); +} + /// Create a layout attribute from the given parameters. static xegpu::LayoutAttr createLayoutAttr(MLIRContext *ctx, ArrayRef sgLayout, @@ -90,6 +132,36 @@ createLayoutAttr(MLIRContext *ctx, ArrayRef sgLayout, /*order=*/nullptr); } +/// Generate `xegpu::LayoutAttr` from op mixed layout values. +DiagnosedSilenceableFailure +getLayoutAttrFromOperands(MLIRContext *ctx, transform::TransformState &state, + TransformOpInterface transformOp, + ArrayRef<::mlir::OpFoldResult> mixedSgLayout, + ArrayRef<::mlir::OpFoldResult> mixedSgData, + ArrayRef<::mlir::OpFoldResult> mixedInstData, + xegpu::LayoutAttr &layoutAttr) { + SmallVector sgLayout, sgData, instData; + auto status = + convertMixedValuesToInt(state, transformOp, sgLayout, mixedSgLayout); + if (!status.succeeded()) + return status; + + status = convertMixedValuesToInt(state, transformOp, sgData, mixedSgData); + if (!status.succeeded()) + return status; + + status = convertMixedValuesToInt(state, transformOp, instData, mixedInstData); + if (!status.succeeded()) + return status; + auto maybeInstData = instData.empty() + ? std::nullopt + : std::optional>(instData); + + layoutAttr = createLayoutAttr(ctx, sgLayout, sgData, maybeInstData); + + return DiagnosedSilenceableFailure::success(); +} + /// Replace xegpu.create_nd_desc op with a new one with the given layout. static xegpu::CreateNdDescOp setDescLayout(transform::TransformRewriter &rewriter, @@ -111,6 +183,29 @@ setDescLayout(transform::TransformRewriter &rewriter, return newDescOp; } +DiagnosedSilenceableFailure +transform::GetDescOp::apply(transform::TransformRewriter &rewriter, + transform::TransformResults &results, + transform::TransformState &state) { + auto targetValues = state.getPayloadValues(getTarget()); + if (!llvm::hasSingleElement(targetValues)) { + return emitDefiniteFailure() + << "requires exactly one target value handle (got " + << llvm::range_size(targetValues) << ")"; + } + + auto maybeDescOp = + findProducerOfType(*targetValues.begin()); + if (!maybeDescOp) { + return emitSilenceableFailure(getLoc()) + << "Could not find a matching descriptor op when walking the " + "producer chain of the first operand."; + } + + results.set(llvm::cast(getResult()), {*maybeDescOp}); + return DiagnosedSilenceableFailure::success(); +} + void transform::SetDescLayoutOp::build(OpBuilder &builder, OperationState &result, Value target, ArrayRef mixedSgLayout, @@ -142,26 +237,13 @@ transform::SetDescLayoutOp::apply(transform::TransformRewriter &rewriter, } Operation *target = *targetOps.begin(); - SmallVector sgLayout; - DiagnosedSilenceableFailure status = - convertMixedValuesToInt(state, (*this), sgLayout, getMixedSgLayout()); - if (!status.succeeded()) - return status; - - SmallVector sgData; - status = convertMixedValuesToInt(state, (*this), sgData, getMixedSgData()); + xegpu::LayoutAttr layoutAttr = nullptr; + auto status = getLayoutAttrFromOperands(getContext(), state, (*this), + getMixedSgLayout(), getMixedSgData(), + getMixedInstData(), layoutAttr); if (!status.succeeded()) return status; - SmallVector instData; - status = - convertMixedValuesToInt(state, (*this), instData, getMixedInstData()); - if (!status.succeeded()) - return status; - auto maybeInstData = instData.empty() - ? std::nullopt - : std::optional>(instData); - // For now only create_nd_desc op is supported. auto descOp = dyn_cast(target); if (!descOp) { @@ -173,8 +255,6 @@ transform::SetDescLayoutOp::apply(transform::TransformRewriter &rewriter, } // Set layout attr in desc op's return type. Replaces old desc op. - auto layoutAttr = - createLayoutAttr(rewriter.getContext(), sgLayout, sgData, maybeInstData); auto newdescOp = setDescLayout(rewriter, descOp, layoutAttr); // Map result handles. @@ -193,6 +273,74 @@ void transform::SetDescLayoutOp::getEffects( modifiesPayload(effects); } +void transform::SetOpLayoutAttrOp::build( + OpBuilder &builder, OperationState &ostate, Value target, int64_t index, + ArrayRef mixedSgLayout, ArrayRef mixedSgData, + ArrayRef mixedInstData, bool result) { + SmallVector staticSgLayout, staticSgData, staticInstData; + SmallVector dynamicSgLayout, dynamicSgData, dynamicInstData; + dispatchIndexOpFoldResults(mixedSgLayout, dynamicSgLayout, staticSgLayout); + dispatchIndexOpFoldResults(mixedSgData, dynamicSgData, staticSgData); + dispatchIndexOpFoldResults(mixedInstData, dynamicInstData, staticInstData); + build(builder, ostate, target.getType(), + /*target=*/target, + /*index=*/index, + /*sg_layout=*/dynamicSgLayout, + /*sg_data=*/dynamicSgData, + /*inst_data=*/dynamicInstData, + /*static_sg_layout=*/staticSgLayout, + /*static_sg_data=*/staticSgData, + /*static_inst_data=*/staticInstData, + /*result=*/result); +} + +DiagnosedSilenceableFailure +transform::SetOpLayoutAttrOp::apply(transform::TransformRewriter &rewriter, + transform::TransformResults &results, + transform::TransformState &state) { + auto targetOps = state.getPayloadOps(getTarget()); + if (!llvm::hasSingleElement(targetOps)) { + return emitDefiniteFailure() << "Requires exactly one targetOp handle (got " + << llvm::range_size(targetOps) << ")"; + } + Operation *target = *targetOps.begin(); + + bool resultTarget = getResult(); + + int64_t index = getIndex(); + if (resultTarget && index >= target->getNumResults()) { + return emitSilenceableFailure(getLoc()) + << "Index exceeds the number of op results"; + } + if (!resultTarget && index >= target->getNumOperands()) { + return emitSilenceableFailure(getLoc()) + << "Index exceeds the number of op operands"; + } + + xegpu::LayoutAttr layoutAttr = nullptr; + auto status = getLayoutAttrFromOperands(getContext(), state, (*this), + getMixedSgLayout(), getMixedSgData(), + getMixedInstData(), layoutAttr); + if (!status.succeeded()) + return status; + + // Set layout attribute for the op result or operand + if (resultTarget) + xegpu::setDistributeLayoutAttr(target->getResult(index), layoutAttr); + else + xegpu::setDistributeLayoutAttr(target->getOpOperand(index), layoutAttr); + return DiagnosedSilenceableFailure::success(); +} + +void transform::SetOpLayoutAttrOp::getEffects( + ::llvm::SmallVectorImpl &effects) { + onlyReadsHandle(getTargetMutable(), effects); + onlyReadsHandle(getSgLayoutMutable(), effects); + onlyReadsHandle(getSgDataMutable(), effects); + onlyReadsHandle(getInstDataMutable(), effects); + modifiesPayload(effects); +} + namespace { class XeGPUTransformDialectExtension : public transform::TransformDialectExtension< diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 4e1a539771d2f..b3a780abd3f12 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -53,6 +53,8 @@ using namespace mlir::dataflow; namespace { +enum class LayoutKind { Lane, InstData }; + //===----------------------------------------------------------------------===// // LayoutInfo //===----------------------------------------------------------------------===// @@ -166,7 +168,8 @@ LayoutInfo LayoutInfo::join(const LayoutInfo &lhs, const LayoutInfo &rhs) { llvm_unreachable("Join should not be triggered by layout propagation."); } -/// Construct a new layout with the transposed lane layout and lane data. +/// Construct a new layout with the transposed inst_data or lane_layout, +/// lane_data. LayoutInfo LayoutInfo::transpose(ArrayRef permutation) const { if (!isAssigned()) return {}; @@ -186,12 +189,20 @@ LayoutInfo LayoutInfo::transpose(ArrayRef permutation) const { SmallVector laneData; SmallVector instData; for (int64_t idx : permutation) { - laneLayout.push_back(static_cast(getLaneLayout()[idx])); - laneData.push_back(static_cast(getLaneData()[idx])); - instData.push_back(static_cast(getInstData()[idx])); + if (getLaneLayout().size()) { + laneLayout.push_back(static_cast(getLaneLayout()[idx])); + laneData.push_back(static_cast(getLaneData()[idx])); + } + if (getInstData().size()) + instData.push_back(static_cast(getInstData()[idx])); } - return LayoutInfo(xegpu::LayoutAttr::get(storage.getContext(), instData, - laneLayout, laneData)); + xegpu::LayoutAttr layoutAttr; + if (getLaneLayout().size()) + layoutAttr = + xegpu::LayoutAttr::get(storage.getContext(), laneLayout, laneData); + if (getInstData().size()) + layoutAttr = xegpu::LayoutAttr::get(storage.getContext(), instData); + return LayoutInfo(layoutAttr); } //===----------------------------------------------------------------------===// @@ -213,15 +224,14 @@ struct LayoutInfoLattice : public Lattice { /// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1]. static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx, unsigned rank, - const xegpu::uArch::uArch *uArch, - ArrayRef instData) { + const xegpu::uArch::uArch *uArch) { assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector."); if (rank == 1) { return LayoutInfo( - xegpu::LayoutAttr::get(ctx, instData, {uArch->getSubgroupSize()}, {1})); + xegpu::LayoutAttr::get(ctx, {uArch->getSubgroupSize()}, {1})); } - return LayoutInfo(xegpu::LayoutAttr::get( - ctx, instData, {1, uArch->getSubgroupSize()}, {1, 1})); + return LayoutInfo( + xegpu::LayoutAttr::get(ctx, {1, uArch->getSubgroupSize()}, {1, 1})); } static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx, @@ -236,7 +246,6 @@ static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx, /// Helper to get the default layout for a vector type. static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy, const xegpu::uArch::uArch *uArch, - ArrayRef instData, unsigned packingSize, bool isScattered = false) { // Expecting a 1D or 2D vector. @@ -247,16 +256,16 @@ static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy, "Expected int or float element type."); // If the rank is 1, then return default layout for 1D vector. if (vectorTy.getRank() == 1) - return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1, uArch, instData); + return getDefaultSIMTLayoutInfo(vectorTy.getContext(), 1, uArch); // Packing factor is determined by the element type bitwidth. unsigned bitwidth = vectorTy.getElementType().getIntOrFloatBitWidth(); int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1; if (isScattered) { - return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData, + return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), {uArch->getSubgroupSize(), 1}, {1, packingFactor})); } - return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), instData, + return LayoutInfo(xegpu::LayoutAttr::get(vectorTy.getContext(), {1, uArch->getSubgroupSize()}, {1, packingFactor})); } @@ -264,7 +273,6 @@ static LayoutInfo getDefaultSIMTLayoutInfo(VectorType vectorTy, /// Helper to get the default layout for a vector type. static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy, const xegpu::uArch::uArch *uArch, - ArrayRef instData, unsigned packingSize, bool isScattered = false) { // Expecting a 1D or 2D vector. @@ -275,18 +283,18 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy, "Expected int or float element type."); // If the rank is 1, then return default layout for 1D vector. if (tdescTy.getRank() == 1) - return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1, uArch, instData); + return getDefaultSIMTLayoutInfo(tdescTy.getContext(), 1, uArch); // Packing factor is determined by the element type bitwidth. unsigned bitwidth = tdescTy.getElementType().getIntOrFloatBitWidth(); int subgroupSize = uArch->getSubgroupSize(); int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1; if (isScattered) { return LayoutInfo(xegpu::LayoutAttr::get( - tdescTy.getContext(), instData, {subgroupSize, 1}, {1, packingFactor})); + tdescTy.getContext(), {subgroupSize, 1}, {1, packingFactor})); } return LayoutInfo(xegpu::LayoutAttr::get( - tdescTy.getContext(), instData, {1, subgroupSize}, {1, packingFactor})); + tdescTy.getContext(), {1, subgroupSize}, {1, packingFactor})); } /// Helper Function to get the expected layouts for DPAS operands. `lane_data` @@ -298,7 +306,7 @@ static LayoutInfo getDefaultSIMTLayoutInfo(xegpu::TensorDescType tdescTy, static LayoutInfo getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, unsigned operandNum, const xegpu::uArch::uArch *uArch, - ArrayRef instData, unsigned packingSize) { + unsigned packingSize) { Type elementTy = vectorTy.getElementType(); assert(elementTy.isIntOrFloat() && "Expected int or float type in DPAS operands"); @@ -310,10 +318,10 @@ getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, unsigned operandNum, {static_cast(packingSize / elementTy.getIntOrFloatBitWidth()), 1}); return LayoutInfo( - xegpu::LayoutAttr::get(vectorTy.getContext(), instData, layout, data)); + xegpu::LayoutAttr::get(vectorTy.getContext(), layout, data)); } // Otherwise, return the default layout for the vector type. - return getDefaultSIMTLayoutInfo(vectorTy, uArch, instData, packingSize); + return getDefaultSIMTLayoutInfo(vectorTy, uArch, packingSize); } //===----------------------------------------------------------------------===// @@ -328,6 +336,7 @@ getSIMTLayoutInfoForDPASOperand(VectorType vectorTy, unsigned operandNum, class LayoutInfoPropagation : public SparseBackwardDataFlowAnalysis { private: + LayoutKind layoutKind; void visitDpasOp(xegpu::DpasOp dpas, ArrayRef operands, ArrayRef results); @@ -380,8 +389,10 @@ class LayoutInfoPropagation public: LayoutInfoPropagation(DataFlowSolver &solver, - SymbolTableCollection &symbolTable) - : SparseBackwardDataFlowAnalysis(solver, symbolTable) {} + SymbolTableCollection &symbolTable, + LayoutKind layoutKind) + : SparseBackwardDataFlowAnalysis(solver, symbolTable), + layoutKind(layoutKind) {} using SparseBackwardDataFlowAnalysis::SparseBackwardDataFlowAnalysis; LogicalResult @@ -499,8 +510,14 @@ void LayoutInfoPropagation::visitPrefetchNdOp( "No suitable instruction multiple found for the given shape."); instData = {instHeight, instWidth}; } - auto prefetchLayout = getDefaultSIMTLayoutInfo( - tdescTy, uArch, instData, uArchInstruction->getPackedFormatBitSize()); + LayoutInfo prefetchLayout; + if (layoutKind == LayoutKind::InstData) + prefetchLayout = + LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(), instData)); + else + prefetchLayout = getDefaultSIMTLayoutInfo( + tdescTy, uArch, uArchInstruction->getPackedFormatBitSize()); + // Propagate the layout to the source tensor descriptor. propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout)); } @@ -627,14 +644,24 @@ void LayoutInfoPropagation::visitDpasOp( SmallVector instDataA = {maxALen, subgroupSize}; SmallVector instDataB = {subgroupSize, maxBLen}; - propagateIfChanged(operands[0], - operands[0]->meet(getSIMTLayoutInfoForDPASOperand( - aTy, 0, uArch, instDataA, - uArchInstruction->getPackedFormatBitSizeA()))); - propagateIfChanged(operands[1], - operands[1]->meet(getSIMTLayoutInfoForDPASOperand( - bTy, 1, uArch, instDataB, - uArchInstruction->getPackedFormatBitSizeB()))); + LayoutInfo dpasALayout; + LayoutInfo dpasBLayout; + LayoutInfo dpasCLayout; + + if (layoutKind == LayoutKind::InstData) { + dpasALayout = + LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataA)); + dpasBLayout = + LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataB)); + } else { + dpasALayout = getSIMTLayoutInfoForDPASOperand( + aTy, 0, uArch, uArchInstruction->getPackedFormatBitSizeA()); + dpasBLayout = getSIMTLayoutInfoForDPASOperand( + bTy, 1, uArch, uArchInstruction->getPackedFormatBitSizeB()); + } + + propagateIfChanged(operands[0], operands[0]->meet(dpasALayout)); + propagateIfChanged(operands[1], operands[1]->meet(dpasBLayout)); if (operands.size() > 2) { VectorType cTy = dpas.getAccType(); const unsigned dataCLen = bTy.getShape().back(); @@ -645,10 +672,15 @@ void LayoutInfoPropagation::visitDpasOp( dpas.emitWarning( "No suitable instruction multiple found for the given shape."); SmallVector instDataC = {maxALen, maxCLen}; - propagateIfChanged(operands[2], - operands[2]->meet(getSIMTLayoutInfoForDPASOperand( - cTy, 2, uArch, instDataC, - uArchInstruction->getPackedFormatBitSizeB()))); + + if (layoutKind == LayoutKind::InstData) + dpasCLayout = + LayoutInfo(xegpu::LayoutAttr::get(dpas.getContext(), instDataC)); + else + dpasCLayout = getSIMTLayoutInfoForDPASOperand( + cTy, 2, uArch, uArchInstruction->getPackedFormatBitSizeB()); + + propagateIfChanged(operands[2], operands[2]->meet(dpasCLayout)); } } @@ -685,9 +717,15 @@ void LayoutInfoPropagation::visitStoreNdOp( "No suitable instruction multiple found for the given shape."); instData = {instHeight, instWidth}; } - LayoutInfo storeLayout = - getDefaultSIMTLayoutInfo(store.getValueType(), uArch, instData, - uArchInstruction->getPackedFormatBitSize()); + + LayoutInfo storeLayout; + if (layoutKind == LayoutKind::InstData) + storeLayout = + LayoutInfo(xegpu::LayoutAttr::get(dataTy.getContext(), instData)); + else + storeLayout = + getDefaultSIMTLayoutInfo(store.getValueType(), uArch, + uArchInstruction->getPackedFormatBitSize()); // Both operands should have the same layout for (LayoutInfoLattice *operand : operands) propagateIfChanged(operand, operand->meet(storeLayout)); @@ -818,9 +856,13 @@ void LayoutInfoPropagation::visitLoadGatherOp( if (srcTdescTy.getChunkSizeAsInt() > 1) instData.push_back(chunkSize); } - LayoutInfo layout = getDefaultSIMTLayoutInfo( - payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(), - /*scattered*/ true); + LayoutInfo layout; + if (layoutKind == LayoutKind::InstData) + layout = LayoutInfo(xegpu::LayoutAttr::get(load.getContext(), instData)); + else + layout = getDefaultSIMTLayoutInfo(payloadTy, uArch, + uArch->getGeneralPackedFormatBitSize(), + /*scattered*/ true); // Mask operand should have 1D default layout. LayoutInfo maskLayout = @@ -864,33 +906,36 @@ void LayoutInfoPropagation::visitStoreScatterOp( storeScatter.emitWarning("Not propagating, non-vector payload supplied."); return; } + LayoutInfo payloadLayout; auto uArch = getUArch(getChipStr(storeScatter).value_or("")); const int subgroupSize = uArch->getSubgroupSize(); - auto payloadShape = payloadTy.getShape(); - if (payloadShape.size() > 1) - assert( - payloadShape[0] == subgroupSize && - "Expected the first dimension of 2D tensor descriptor to be equal to " - "subgroup size."); - - SmallVector instData{subgroupSize}; - if (auto chunkSize = storeScatter.getChunkSize().value_or(0); chunkSize > 1) - instData.push_back(chunkSize); - else if (auto dstTdescTy = - dyn_cast(storeScatter.getDestType())) { - if (dstTdescTy.getChunkSizeAsInt() > 1) - instData.push_back(chunkSize); - } - - LayoutInfo payloadLayout; - if (auto layout = storeScatter.getLayoutAttr()) { payloadLayout = LayoutInfo(layout); } else { - payloadLayout = getDefaultSIMTLayoutInfo( - payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(), - /*scattered=*/true); + if (layoutKind == LayoutKind::InstData) { + SmallVector instData{subgroupSize}; + if (auto chunkSize = storeScatter.getChunkSize().value_or(0); + chunkSize > 1) + instData.push_back(chunkSize); + else if (auto dstTdescTy = dyn_cast( + storeScatter.getDestType())) { + if (dstTdescTy.getChunkSizeAsInt() > 1) + instData.push_back(chunkSize); + } + payloadLayout = LayoutInfo( + xegpu::LayoutAttr::get(storeScatter.getContext(), instData)); + } else { + auto payloadShape = payloadTy.getShape(); + if (payloadShape.size() > 1) + assert(payloadShape[0] == subgroupSize && + "Expected the first dimension of 2D tensor descriptor to be " + "equal to " + "subgroup size."); + payloadLayout = getDefaultSIMTLayoutInfo( + payloadTy, uArch, uArch->getGeneralPackedFormatBitSize(), + /*scattered=*/true); + } } LayoutInfo maskLayout = @@ -916,10 +961,10 @@ class RunLayoutInfoPropagation { public: MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(RunLayoutInfoPropagation) - RunLayoutInfoPropagation(Operation *op) : target(op) { + RunLayoutInfoPropagation(Operation *op, LayoutKind layoutKind) : target(op) { SymbolTableCollection symbolTable; loadBaselineAnalyses(solver); - solver.load(symbolTable); + solver.load(symbolTable, layoutKind); (void)solver.initializeAndRun(op); } @@ -1159,7 +1204,18 @@ struct XeGPUPropagateLayoutPass final } // namespace void XeGPUPropagateLayoutPass::runOnOperation() { - auto &analysis = getAnalysis(); + LayoutKind layoutKind; + if (this->layoutKind == "lane") { + layoutKind = LayoutKind::Lane; + } else if (this->layoutKind == "inst") { + layoutKind = LayoutKind::InstData; + } else { + getOperation()->emitError("Unsupported layout kind option: " + + this->layoutKind); + signalPassFailure(); + return; + } + RunLayoutInfoPropagation analysis(getOperation(), layoutKind); // Print the analysis result and exit. (for debugging purposes) if (printOnly) { auto &os = llvm::outs(); @@ -1173,8 +1229,6 @@ void XeGPUPropagateLayoutPass::runOnOperation() { return {}; xegpu::DistributeLayoutAttr layoutAttr = cast(layout.get()); - if (this->layoutKind == "lane") - layoutAttr = layoutAttr.dropInstData(); if (layout.isSliceLayout()) return cast(layoutAttr); return cast(layoutAttr); diff --git a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp index 9ef405dad5a70..018a188d09109 100644 --- a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp +++ b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp @@ -681,17 +681,8 @@ processBuffer(raw_ostream &os, std::unique_ptr ownedBuffer, return success(); } -std::pair -mlir::registerAndParseCLIOptions(int argc, char **argv, - llvm::StringRef toolName, - DialectRegistry ®istry) { - static cl::opt inputFilename( - cl::Positional, cl::desc(""), cl::init("-")); - - static cl::opt outputFilename("o", cl::desc("Output filename"), - cl::value_desc("filename"), - cl::init("-")); - // Register any command line options. +std::string mlir::registerCLIOptions(llvm::StringRef toolName, + DialectRegistry ®istry) { MlirOptMainConfig::registerCLOptions(registry); registerAsmPrinterCLOptions(); registerMLIRContextCLOptions(); @@ -706,11 +697,29 @@ mlir::registerAndParseCLIOptions(int argc, char **argv, interleaveComma(registry.getDialectNames(), os, [&](auto name) { os << name; }); } - // Parse pass names in main to ensure static initialization completed. + return helpHeader; +} + +std::pair +mlir::parseCLIOptions(int argc, char **argv, llvm::StringRef helpHeader) { + static cl::opt inputFilename( + cl::Positional, cl::desc(""), cl::init("-")); + + static cl::opt outputFilename("o", cl::desc("Output filename"), + cl::value_desc("filename"), + cl::init("-")); cl::ParseCommandLineOptions(argc, argv, helpHeader); return std::make_pair(inputFilename.getValue(), outputFilename.getValue()); } +std::pair +mlir::registerAndParseCLIOptions(int argc, char **argv, + llvm::StringRef toolName, + DialectRegistry ®istry) { + auto helpHeader = registerCLIOptions(toolName, registry); + return parseCLIOptions(argc, argv, helpHeader); +} + static LogicalResult printRegisteredDialects(DialectRegistry ®istry) { llvm::outs() << "Available Dialects: "; interleave(registry.getDialectNames(), llvm::outs(), ","); diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index f8c38fadbd229..9945a711d5c74 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -2856,17 +2856,19 @@ LogicalResult OperationLegalizer::legalizePatternResult( assert(impl.pendingRootUpdates.empty() && "dangling root updates"); #if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS - // Check that the root was either replaced or updated in place. - auto newRewrites = llvm::drop_begin(impl.rewrites, curState.numRewrites); - auto replacedRoot = [&] { - return hasRewrite(newRewrites, op); - }; - auto updatedRootInPlace = [&] { - return hasRewrite(newRewrites, op); - }; - if (!replacedRoot() && !updatedRootInPlace()) - llvm::report_fatal_error( - "expected pattern to replace the root operation or modify it in place"); + if (impl.config.allowPatternRollback) { + // Check that the root was either replaced or updated in place. + auto newRewrites = llvm::drop_begin(impl.rewrites, curState.numRewrites); + auto replacedRoot = [&] { + return hasRewrite(newRewrites, op); + }; + auto updatedRootInPlace = [&] { + return hasRewrite(newRewrites, op); + }; + if (!replacedRoot() && !updatedRootInPlace()) + llvm::report_fatal_error("expected pattern to replace the root operation " + "or modify it in place"); + } #endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS // Legalize each of the actions registered during application. diff --git a/mlir/python/mlir/dialects/transform/xegpu.py b/mlir/python/mlir/dialects/transform/xegpu.py index 2918bf592880a..ce8015d8f557b 100644 --- a/mlir/python/mlir/dialects/transform/xegpu.py +++ b/mlir/python/mlir/dialects/transform/xegpu.py @@ -7,6 +7,7 @@ try: from ...ir import * + from ...dialects import transform from .._ods_common import _cext as _ods_cext from .._ods_common import ( MixedValues, @@ -20,6 +21,26 @@ from typing import Union, Optional +@_ods_cext.register_operation(_Dialect, replace=True) +class GetDescOp(GetDescOp): + """Specialization for GetDescOp class.""" + + def __init__( + self, + target: Value, + *, + loc=None, + ip=None, + ): + desc_type = transform.AnyOpType.get() + super().__init__( + desc_type, + target, + loc=loc, + ip=ip, + ) + + @_ods_cext.register_operation(_Dialect, replace=True) class SetDescLayoutOp(SetDescLayoutOp): """Specialization for SetDescLayoutOp class.""" @@ -64,3 +85,50 @@ def __init__( loc=loc, ip=ip, ) + + +@_ods_cext.register_operation(_Dialect, replace=True) +class SetOpLayoutAttrOp(SetOpLayoutAttrOp): + """Specialization for SetOpLayoutAttrOp class.""" + + def __init__( + self, + target: Union[Operation, Value], + sg_layout: MixedValues, + sg_data: MixedValues, + *, + inst_data: Optional[MixedValues] = None, + index: Optional[Union[int, Attribute]] = None, + result: Optional[Union[bool, Attribute]] = None, + loc=None, + ip=None, + ): + inst_data = [] if inst_data is None else inst_data + ( + dynamic_sg_layout, + static_sg_layout, + _, + ) = _dispatch_dynamic_index_list(sg_layout) + ( + dynamic_sg_data, + static_sg_data, + _, + ) = _dispatch_dynamic_index_list(sg_data) + ( + dynamic_inst_data, + static_inst_data, + _, + ) = _dispatch_dynamic_index_list(inst_data) + super().__init__( + _get_op_result_or_value(target), + dynamic_sg_layout, + dynamic_sg_data, + dynamic_inst_data, + static_sg_layout=static_sg_layout, + static_sg_data=static_sg_data, + static_inst_data=static_inst_data, + index=index, + result=result, + loc=loc, + ip=ip, + ) diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir index 58461b8be52c4..c31ef323a94d2 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout-inst-data.mlir @@ -2,17 +2,17 @@ // CHECK-LABEL: func.func @dpas_f16( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout} : -// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] {layout_result_0 = #xegpu.layout} : -// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> -// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout} : +// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense<0.000000e+00> : vector<8x16xf32> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout +// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T0]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T1]] {layout_result_0 = #xegpu.layout} : +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK: %[[T4:.*]] = xegpu.dpas %[[T2]], %[[T3]], %[[CST]] {layout_result_0 = #xegpu.layout} : // CHECK-SAME: vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> -// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> +// CHECK: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout +// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> gpu.module @test { func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { @@ -46,18 +46,18 @@ gpu.module @test_kernel { %out:3 = scf.for %k = %c0 to %c1024 step %c32 iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) -> (!xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>, !xegpu.tensor_desc<16x32xf16>) { - //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout} : - //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout> -> vector<16x32xf16> + //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout} : + //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.layout> -> vector<16x32xf16> %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<16x32xf16> -> vector<16x32xf16> - //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout} : vector<16x32xf16> + //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout} : vector<16x32xf16> %c = arith.addf %a, %b : vector<16x32xf16> - //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #xegpu.layout>> + //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16, #xegpu.layout> xegpu.store_nd %c, %arg2: vector<16x32xf16>, !xegpu.tensor_desc<16x32xf16> - //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x32xf16, #xegpu.layout> + //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16x32xf16, #xegpu.layout> %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16> %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16> %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<16x32xf16> @@ -85,18 +85,18 @@ gpu.module @test_kernel { %out:3 = scf.for %k = %c0 to %c1024 step %c32 iter_args(%arg0 = %a_tdesc, %arg1 = %b_tdesc, %arg2 = %c_tdesc) -> (!xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>, !xegpu.tensor_desc<12x32xf16>) { - //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout} : - //CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout> -> vector<12x32xf16> + //CHECK: xegpu.load_nd {{.*}} {layout_result_0 = #xegpu.layout} : + //CHECK-SAME: !xegpu.tensor_desc<12x32xf16, #xegpu.layout> -> vector<12x32xf16> %a = xegpu.load_nd %arg0 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> %b = xegpu.load_nd %arg1 : !xegpu.tensor_desc<12x32xf16> -> vector<12x32xf16> - //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout} : vector<12x32xf16> + //CHECK-COUNT: arith.addf {{.*}} {layout_result_0 = #xegpu.layout} : vector<12x32xf16> %c = arith.addf %a, %b : vector<12x32xf16> - //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16, #xegpu.layout>> + //CHECK-COUNT: xegpu.store_nd {{.*}} : vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16, #xegpu.layout> xegpu.store_nd %c, %arg2: vector<12x32xf16>, !xegpu.tensor_desc<12x32xf16> - //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<12x32xf16, #xegpu.layout> + //CHECK-COUNT: xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<12x32xf16, #xegpu.layout> %a_next_tdesc = xegpu.update_nd_offset %arg0, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16> %b_next_tdesc = xegpu.update_nd_offset %arg1, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16> %c_next_tdesc = xegpu.update_nd_offset %arg2, [%c0, %c32] : !xegpu.tensor_desc<12x32xf16> @@ -114,7 +114,7 @@ gpu.module @test { // CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> // CHECK: %{{.*}} = arith.constant {layout_result_0 = #xegpu.layout} dense<12> : vector<16xindex> // CHECK: %{{.*}} = xegpu.load %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> -// CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> +// CHECK-SAME: {layout_result_0 = #xegpu.layout} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16> // CHECK: xegpu.store %0, %[[ARG0]][%{{.*}}], %{{.*}} <{chunk_size = 8 : i64}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> func.func @scatter_ops_chunksize(%src: memref<256xf16>) { %1 = arith.constant dense<1>: vector<16xi1> diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index 61e315d0d2080..eb004932af4be 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout -split-input-file %s | FileCheck %s +// RUN: mlir-opt -xevm-attach-target='chip=pvc' -xegpu-propagate-layout="layout-kind=lane" -split-input-file %s | FileCheck %s gpu.module @test { // CHECK-LABEL: func.func @dpas_f16( diff --git a/mlir/test/Dialect/XeGPU/transform-ops-invalid.mlir b/mlir/test/Dialect/XeGPU/transform-ops-invalid.mlir index 303584518f9f4..726b6748452ae 100644 --- a/mlir/test/Dialect/XeGPU/transform-ops-invalid.mlir +++ b/mlir/test/Dialect/XeGPU/transform-ops-invalid.mlir @@ -13,3 +13,61 @@ module attributes {transform.with_named_sequence} { transform.yield } } + +// ----- + +// CHECK-LABEL: @set_op_layout_attr_bad_result_index +func.func @set_op_layout_attr_bad_result_index(%arg0: memref<4096x4096xf16>) { + %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16> + %1 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16> + %2 = arith.extf %1 : vector<256x32xf16> to vector<256x32xf32> + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["arith.extf"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error@below {{Index exceeds the number of op results}} + transform.xegpu.set_op_layout_attr %0 result index = 1 sg_layout = [8, 4] sg_data = [32, 64] : !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @set_op_layout_attr_bad_operand_index +func.func @set_op_layout_attr_bad_operand_index(%arg0: memref<4096x4096xf16>) { + %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16> + %1 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16> + %2 = arith.extf %1 : vector<256x32xf16> to vector<256x32xf32> + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["arith.extf"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error@below {{Index exceeds the number of op operands}} + transform.xegpu.set_op_layout_attr %0 index = 1 sg_layout = [8, 4] sg_data = [32, 64] : !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @set_op_layout_attr_multiple +func.func @set_op_layout_attr_multiple(%arg0: memref<4096x4096xf16>) { + %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16> + %1 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16> + %2 = arith.extf %1 : vector<256x32xf16> to vector<256x32xf32> + %3 = arith.extf %2 : vector<256x32xf32> to vector<256x32xf64> + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["arith.extf"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error@below {{Requires exactly one targetOp handle (got 2)}} + transform.xegpu.set_op_layout_attr %0 sg_layout = [8, 4] sg_data = [32, 64] : !transform.any_op + transform.yield + } +} diff --git a/mlir/test/Dialect/XeGPU/transform-ops.mlir b/mlir/test/Dialect/XeGPU/transform-ops.mlir index 23e1cd946b4cd..bd6a79244ed30 100644 --- a/mlir/test/Dialect/XeGPU/transform-ops.mlir +++ b/mlir/test/Dialect/XeGPU/transform-ops.mlir @@ -1,5 +1,67 @@ // RUN: mlir-opt %s -transform-interpreter -split-input-file -verify-diagnostics | FileCheck %s +// CHECK-LABEL: @get_desc_op_a +func.func @get_desc_op_a(%arg0: memref<4096x4096xf16>, %arg1: memref<4096x4096xf16>, %arg2: memref<4096x4096xf16>) { + %c32 = arith.constant 32 : index + %c4096 = arith.constant 4096 : index + %c0 = arith.constant 0 : index + %0 = xegpu.create_nd_tdesc %arg2 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x256xf16> + %1 = xegpu.load_nd %0[%c0, %c0] : !xegpu.tensor_desc<256x256xf16> -> vector<256x256xf16> + // expected-remark @below {{found desc op}} + %3 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16> + %4 = xegpu.create_nd_tdesc %arg1 : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x256xf16> + %2 = scf.for %arg3 = %c0 to %c4096 step %c32 iter_args(%arg4 = %1) -> (vector<256x256xf16>) { + %5 = xegpu.load_nd %3[%c0, %arg3] : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16> + %6 = xegpu.load_nd %4[%arg3, %c0] : !xegpu.tensor_desc<32x256xf16> -> vector<32x256xf16> + %7 = xegpu.dpas %5, %6, %arg4 : vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf16> -> vector<256x256xf16> + scf.yield %7 : vector<256x256xf16> + } + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["xegpu.dpas"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_operand %0[0] : (!transform.any_op) -> !transform.any_value + %2 = transform.xegpu.get_desc_op %1 : (!transform.any_value) -> !transform.any_op + transform.debug.emit_remark_at %2, "found desc op" : !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @get_desc_op_c +func.func @get_desc_op_c(%arg0: memref<4096x4096xf16>, %arg1: memref<4096x4096xf16>, %arg2: memref<4096x4096xf16>) { + %c32 = arith.constant 32 : index + %c4096 = arith.constant 4096 : index + %c0 = arith.constant 0 : index + // expected-remark @below {{found desc op}} + %0 = xegpu.create_nd_tdesc %arg2 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x256xf16> + %1 = xegpu.load_nd %0[%c0, %c0] : !xegpu.tensor_desc<256x256xf16> -> vector<256x256xf16> + %3 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16> + %4 = xegpu.create_nd_tdesc %arg1 : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x256xf16> + %2 = scf.for %arg3 = %c0 to %c4096 step %c32 iter_args(%arg4 = %1) -> (vector<256x256xf16>) { + %5 = xegpu.load_nd %3[%c0, %arg3] : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16> + %6 = xegpu.load_nd %4[%arg3, %c0] : !xegpu.tensor_desc<32x256xf16> -> vector<32x256xf16> + %7 = xegpu.dpas %5, %6, %arg4 : vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf16> -> vector<256x256xf16> + scf.yield %7 : vector<256x256xf16> + } + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["xegpu.dpas"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_operand %0[2] : (!transform.any_op) -> !transform.any_value + %2 = transform.xegpu.get_desc_op %1 : (!transform.any_value) -> !transform.any_op + transform.debug.emit_remark_at %2, "found desc op" : !transform.any_op + transform.yield + } +} + +// ----- + // CHECK-LABEL: @set_desc_layout func.func @set_desc_layout(%arg0: memref<4096x4096xf16>) { // CHECK: %[[V0:.+]] = xegpu.create_nd_tdesc %arg0 @@ -56,3 +118,137 @@ module attributes {transform.with_named_sequence} { transform.yield } } + +// ----- + +// CHECK-LABEL: @set_op_layout_attr_result_default_index +func.func @set_op_layout_attr_result_default_index(%arg0: memref<4096x4096xf16>, %arg1: memref<4096x4096xf16>, %arg2: memref<4096x4096xf16>) { + %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16> + %1 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16> + %2 = xegpu.create_nd_tdesc %arg1 : memref<4096x4096xf16> -> !xegpu.tensor_desc<32x256xf16> + %3 = xegpu.load_nd %2[0, 0] : !xegpu.tensor_desc<32x256xf16> -> vector<32x256xf16> + %4 = xegpu.create_nd_tdesc %arg2 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x256xf16> + %5 = xegpu.load_nd %4[0, 0] : !xegpu.tensor_desc<256x256xf16> -> vector<256x256xf16> + // CHECK: = xegpu.dpas + // CHECK-SAME: {layout_result_0 = #xegpu.layout} + %6 = xegpu.dpas %1, %3, %5 : vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf16> -> vector<256x256xf16> + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["xegpu.dpas"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // CHECK: transform.xegpu.set_op_layout_attr %{{.*}} + transform.xegpu.set_op_layout_attr %0 result sg_layout = [8, 4] sg_data = [32, 64] inst_data = [8, 16] : !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @set_op_layout_attr_result_sg_param +func.func @set_op_layout_attr_result_sg_param(%arg0: memref<4096x4096xf16>) { + %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16> + %1 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16> + // CHECK: = arith.extf %1 + // CHECK-SAME: {layout_result_0 = #xegpu.layout} + %2 = arith.extf %1 : vector<256x32xf16> to vector<256x32xf32> + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["arith.extf"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // CHECK: transform.xegpu.set_op_layout_attr %{{.*}} + %layout0 = transform.param.constant 8 : i64 -> !transform.param + transform.xegpu.set_op_layout_attr %0 result sg_layout = [%layout0, 4] sg_data = [32, 64] inst_data = [8, 16] : !transform.any_op, !transform.param + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @set_op_layout_attr_result_sg_param2 +func.func @set_op_layout_attr_result_sg_param2(%arg0: memref<4096x4096xf16>) { + %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16> + %1 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16> + // CHECK: = arith.extf %1 + // CHECK-SAME: {layout_result_0 = #xegpu.layout} + %2 = arith.extf %1 : vector<256x32xf16> to vector<256x32xf32> + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["arith.extf"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // CHECK: transform.xegpu.set_op_layout_attr %{{.*}} + %layout0 = transform.param.constant 8 : i64 -> !transform.param + %layout1 = transform.param.constant 4 : i64 -> !transform.param + transform.xegpu.set_op_layout_attr %0 result sg_layout = [%layout0, %layout1] sg_data = [32, 64] inst_data = [8, 16] : !transform.any_op, !transform.param, !transform.param + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @set_op_layout_attr_result0 +func.func @set_op_layout_attr_result0(%arg0: memref<4096x4096xf16>) { + %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16> + %1 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16> + // CHECK: = arith.extf %1 + // CHECK-SAME: {layout_result_0 = #xegpu.layout} + %2 = arith.extf %1 : vector<256x32xf16> to vector<256x32xf32> + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["arith.extf"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // CHECK: transform.xegpu.set_op_layout_attr %{{.*}} + transform.xegpu.set_op_layout_attr %0 result index = 0 sg_layout = [8, 4] sg_data = [32, 64] inst_data = [8, 16] : !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @set_op_layout_attr_operand_minimal +func.func @set_op_layout_attr_operand_minimal(%arg0: memref<4096x4096xf16>) { + %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16> + %1 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16> + // CHECK: = arith.extf %1 + // CHECK-SAME: {layout_operand_0 = #xegpu.layout} + %2 = arith.extf %1 : vector<256x32xf16> to vector<256x32xf32> + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["arith.extf"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // CHECK: transform.xegpu.set_op_layout_attr %{{.*}} + transform.xegpu.set_op_layout_attr %0 sg_layout = [8, 4] sg_data = [32, 64] : !transform.any_op + transform.yield + } +} +// ----- + +// CHECK-LABEL: @set_op_layout_attr_operand1 +func.func @set_op_layout_attr_operand1(%arg0: memref<4096x4096xf16>, %arg1: memref<4096x4096xf16>) { + %0 = xegpu.create_nd_tdesc %arg0 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16> + %1 = xegpu.load_nd %0[0, 0] : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16> + %2 = xegpu.create_nd_tdesc %arg1 : memref<4096x4096xf16> -> !xegpu.tensor_desc<256x32xf16> + %3 = xegpu.load_nd %2[0, 0] : !xegpu.tensor_desc<256x32xf16> -> vector<256x32xf16> + // CHECK: = arith.addf %1, %3 + // CHECK-SAME: {layout_operand_1 = #xegpu.layout} + %6 = arith.addf %1, %3 : vector<256x32xf16> + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["arith.addf"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // CHECK: transform.xegpu.set_op_layout_attr %{{.*}} + transform.xegpu.set_op_layout_attr %0 index = 1 sg_layout = [8, 4] sg_data = [32, 64] inst_data = [8, 16] : !transform.any_op + transform.yield + } +} diff --git a/mlir/test/python/dialects/transform_xegpu_ext.py b/mlir/test/python/dialects/transform_xegpu_ext.py index 1c8a2bcc6a2fb..0b587d2020aa6 100644 --- a/mlir/test/python/dialects/transform_xegpu_ext.py +++ b/mlir/test/python/dialects/transform_xegpu_ext.py @@ -3,7 +3,7 @@ from mlir.ir import * from mlir.dialects import transform from mlir.dialects.transform import xegpu -from mlir.dialects.transform import structured +from mlir.dialects.transform import AnyValueType def run(f): @@ -16,6 +16,21 @@ def run(f): return f +@run +def getDescOpDefaultIndex(): + sequence = transform.SequenceOp( + transform.FailurePropagationMode.Propagate, + [], + transform.OperationType.get("xegpu.dpas"), + ) + with InsertionPoint(sequence.body): + operand = transform.GetOperandOp(AnyValueType.get(), sequence.bodyTarget, [0]) + desc_handle = xegpu.GetDescOp(operand) + transform.YieldOp() + # CHECK-LABEL: TEST: getDescOpDefaultIndex + # CHECK: transform.xegpu.get_desc_op % + + @run def setDescLayoutMinimal(): sequence = transform.SequenceOp( @@ -49,3 +64,52 @@ def setDescLayoutInstData(): # CHECK: sg_layout = [6, 4] # CHECK: sg_data = [32, 16] # CHECK: inst_data = [8, 16] + + +@run +def setOpLayoutAttrOperandMinimal(): + sequence = transform.SequenceOp( + transform.FailurePropagationMode.Propagate, + [], + transform.OperationType.get("xegpu.dpas"), + ) + with InsertionPoint(sequence.body): + xegpu.SetOpLayoutAttrOp( + sequence.bodyTarget, + sg_layout=[6, 4], + sg_data=[32, 16], + ) + transform.YieldOp() + # CHECK-LABEL: TEST: setOpLayoutAttr + # CHECK: transform.xegpu.set_op_layout_attr % + # NO-CHECK: index = 0 + # NO-CHECK: result + # CHECK: sg_layout = [6, 4] + # CHECK: sg_data = [32, 16] + # NO-CHECK: inst_data + + +@run +def setOpLayoutAttrResult(): + sequence = transform.SequenceOp( + transform.FailurePropagationMode.Propagate, + [], + transform.OperationType.get("xegpu.dpas"), + ) + with InsertionPoint(sequence.body): + xegpu.SetOpLayoutAttrOp( + sequence.bodyTarget, + index=0, + sg_layout=[6, 4], + sg_data=[32, 16], + inst_data=[8, 16], + result=True, + ) + transform.YieldOp() + # CHECK-LABEL: TEST: setOpLayoutAttr + # CHECK: transform.xegpu.set_op_layout_attr % + # NO-CHECK: index = 0 + # CHECK: result + # CHECK: sg_layout = [6, 4] + # CHECK: sg_data = [32, 16] + # CHECK: inst_data = [8, 16] diff --git a/offload/test/offloading/gpupgo/pgo_atomic_teams.c b/offload/test/offloading/gpupgo/pgo_atomic_teams.c index 42d8ae43beba1..b3b72db080392 100644 --- a/offload/test/offloading/gpupgo/pgo_atomic_teams.c +++ b/offload/test/offloading/gpupgo/pgo_atomic_teams.c @@ -18,7 +18,6 @@ // REQUIRES: amdgpu // REQUIRES: pgo -// XFAIL: amdgpu int test1(int a) { return a / 2; } int test2(int a) { return a * 2; } diff --git a/offload/test/offloading/gpupgo/pgo_atomic_threads.c b/offload/test/offloading/gpupgo/pgo_atomic_threads.c index 09a4dc1577822..440a6b533317d 100644 --- a/offload/test/offloading/gpupgo/pgo_atomic_threads.c +++ b/offload/test/offloading/gpupgo/pgo_atomic_threads.c @@ -18,7 +18,6 @@ // REQUIRES: amdgpu // REQUIRES: pgo -// XFAIL: amdgpu int test1(int a) { return a / 2; } diff --git a/offload/test/offloading/gpupgo/pgo_device_and_host.c b/offload/test/offloading/gpupgo/pgo_device_and_host.c index c53e69a25e50d..3e95791ce9a50 100644 --- a/offload/test/offloading/gpupgo/pgo_device_and_host.c +++ b/offload/test/offloading/gpupgo/pgo_device_and_host.c @@ -50,7 +50,6 @@ // REQUIRES: amdgpu // REQUIRES: pgo -// XFAIL: amdgpu int main() { int host_var = 0; diff --git a/offload/test/offloading/gpupgo/pgo_device_only.c b/offload/test/offloading/gpupgo/pgo_device_only.c index 644df6e7b0339..2939af613b6dd 100644 --- a/offload/test/offloading/gpupgo/pgo_device_only.c +++ b/offload/test/offloading/gpupgo/pgo_device_only.c @@ -16,7 +16,6 @@ // REQUIRES: amdgpu // REQUIRES: pgo -// XFAIL: amdgpu int test1(int a) { return a / 2; } int test2(int a) { return a * 2; } diff --git a/revert_patches.txt b/revert_patches.txt index 9e465ba90ae6a..b88b846f64b68 100644 --- a/revert_patches.txt +++ b/revert_patches.txt @@ -5,3 +5,6 @@ d57230c7 [AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions (#100485) breaks build of ROCmValidationSuite [C2y] Support WG14 N3457, the __COUNTER__ macro (#162662) --- +breaks fortran declare-target-link1 +[OMPIRBuilder] Fix addrspace of internal critical section lock (#166459 +--- diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index a4ea627fb3d16..7066f498c7d49 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3888,6 +3888,7 @@ cc_library( ":XeGPUDialect", ":XeGPUTransformOpsIncGen", ":XeGPUUtils", + "//llvm:Support", ], )