diff --git a/clang-tools-extra/clangd/SemanticSelection.cpp b/clang-tools-extra/clangd/SemanticSelection.cpp
index 3353121a01825..c2dad53bcec6b 100644
--- a/clang-tools-extra/clangd/SemanticSelection.cpp
+++ b/clang-tools-extra/clangd/SemanticSelection.cpp
@@ -11,9 +11,13 @@
 #include "Protocol.h"
 #include "Selection.h"
 #include "SourceCode.h"
+#include "support/Bracket.h"
+#include "support/DirectiveTree.h"
+#include "support/Token.h"
 #include "clang/AST/DeclBase.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.h"
 #include "clang/Tooling/Syntax/BuildTree.h"
 #include "clang/Tooling/Syntax/Nodes.h"
 #include "clang/Tooling/Syntax/TokenBufferTokenManager.h"
@@ -22,9 +26,6 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Error.h"
-#include "support/Bracket.h"
-#include "support/DirectiveTree.h"
-#include "support/Token.h"
 #include
 #include
 #include
@@ -163,6 +164,69 @@ llvm::Expected<SelectionRange> getSemanticRanges(ParsedAST &AST, Position Pos) {
   return std::move(Head);
 }
 
+class PragmaRegionFinder {
+  // Record the token range of a region:
+  //
+  //   #pragma region name[[
+  //   ...
+  //   ]]#pragma endregion
+  std::vector<Token::Range> &Ranges;
+  const TokenStream &Code;
+  // Stack of starting token (the name of the region) indices for nested #pragma
+  // region.
+  std::vector<unsigned> Stack;
+
+public:
+  PragmaRegionFinder(std::vector<Token::Range> &Ranges, const TokenStream &Code)
+      : Ranges(Ranges), Code(Code) {}
+
+  void walk(const DirectiveTree &T) {
+    for (const auto &C : T.Chunks)
+      std::visit(*this, C);
+  }
+
+  void operator()(const DirectiveTree::Code &C) {}
+
+  void operator()(const DirectiveTree::Directive &D) {
+    // Get the tokens that make up this directive.
+    auto Tokens = Code.tokens(D.Tokens);
+    if (Tokens.empty())
+      return;
+    const Token &HashToken = Tokens.front();
+    assert(HashToken.Kind == tok::hash);
+    const Token &Pragma = HashToken.nextNC();
+    if (Pragma.text() != "pragma")
+      return;
+    const Token &Value = Pragma.nextNC();
+
+    // Handle "#pragma region name"
+    if (Value.text() == "region") {
+      // Find the last token on the same line.
+      const Token *T = &Value.next();
+      while (T < Tokens.end() && T->Line == Pragma.Line)
+        T = &T->next();
+      --T;
+      Stack.push_back(T->OriginalIndex);
+      return;
+    }
+
+    // Handle "#pragma endregion"
+    if (Value.text() == "endregion") {
+      if (Stack.empty())
+        return; // unmatched end region; ignore.
+
+      unsigned StartIdx = Stack.back();
+      Stack.pop_back();
+      Ranges.push_back(Token::Range{StartIdx, HashToken.OriginalIndex});
+    }
+  }
+
+  void operator()(const DirectiveTree::Conditional &C) {
+    for (const auto &[_, SubTree] : C.Branches)
+      walk(SubTree);
+  }
+};
+
 // FIXME(kirillbobyrev): Collect comments, PP conditional regions, includes and
 // other code regions (e.g. public/private/protected sections of classes,
 // control flow statement bodies).
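[Editorial illustration, not part of the patch.] Given the folding logic in the next hunk, a region's folding range opens right after the last token on the "#pragma region" line and closes at the "#" of the matching "#pragma endregion"; nested regions fold independently. A minimal input sketch of what getFoldingRanges would now fold:

    #pragma region Widgets      // folding range opens after "Widgets"
    void drawWidget();
    #pragma region Detail       // nested region, folds on its own
    int widgetCount;
    #pragma endregion           // closes "Detail"
    #pragma endregion           // closes "Widgets"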
@@ -286,6 +350,17 @@ getFoldingRanges(const std::string &Code, bool LineFoldingOnly) { } AddFoldingRange(Start, End, FoldingRange::COMMENT_KIND); } + + // #pragma region + std::vector Ranges; + PragmaRegionFinder(Ranges, OrigStream).walk(DirectiveStructure); + auto Ts = OrigStream.tokens(); + for (const auto &R : Ranges) { + auto End = StartPosition(Ts[R.End]); + if (LineFoldingOnly) + End.line--; + AddFoldingRange(EndPosition(Ts[R.Begin]), End, FoldingRange::REGION_KIND); + } return Result; } diff --git a/clang-tools-extra/clangd/unittests/SemanticSelectionTests.cpp b/clang-tools-extra/clangd/unittests/SemanticSelectionTests.cpp index 4efae25dcd077..2a381d1c8add5 100644 --- a/clang-tools-extra/clangd/unittests/SemanticSelectionTests.cpp +++ b/clang-tools-extra/clangd/unittests/SemanticSelectionTests.cpp @@ -410,6 +410,22 @@ TEST(FoldingRanges, PseudoParserWithoutLineFoldings) { Variable = 3; # )cpp", + R"cpp( + #pragma region R1[[ + + #pragma region R2[[ + constexpr int a = 2; + ]]#pragma endregion + + ]]#pragma endregion + )cpp", + R"cpp( + #pragma region[[ + ]]#pragma endregion + + #pragma /*comment1*/ region /*comment2*/name[[ + ]]#pragma endregion + )cpp", }; for (const char *Test : Tests) { auto T = Annotations(Test); @@ -470,6 +486,12 @@ TEST(FoldingRanges, PseudoParserLineFoldingsOnly) { //[[ foo /* bar */]] )cpp", + R"cpp( + #pragma region abc[[ + constexpr int a = 2; + ]] + #pragma endregion + )cpp", // FIXME: Support folding template arguments. // R"cpp( // template <[[typename foo, class bar]]> struct baz {}; diff --git a/clang/include/clang/AST/ASTImporter.h b/clang/include/clang/AST/ASTImporter.h index 4a0ca45b785a9..39d1429639ed1 100644 --- a/clang/include/clang/AST/ASTImporter.h +++ b/clang/include/clang/AST/ASTImporter.h @@ -190,6 +190,16 @@ class TypeSourceInfo; llvm::SmallDenseMap Aux; }; + class FunctionDeclImportCycleDetector { + public: + auto makeScopedCycleDetection(const FunctionDecl *D); + + bool isCycle(const FunctionDecl *D) const; + + private: + llvm::DenseSet FunctionDeclsWithImportInProgress; + }; + private: std::shared_ptr SharedState = nullptr; @@ -254,6 +264,12 @@ class TypeSourceInfo; /// Declaration (from, to) pairs that are known not to be equivalent /// (which we have already complained about). NonEquivalentDeclSet NonEquivalentDecls; + /// A FunctionDecl can have properties that have a reference to the + /// function itself and are imported before the function is created. This + /// can come for example from auto return type or when template parameters + /// are used in the return type or parameters. This member is used to detect + /// cyclic import of FunctionDecl objects to avoid infinite recursion. 
+ FunctionDeclImportCycleDetector FindFunctionDeclImportCycle; using FoundDeclsTy = SmallVector; FoundDeclsTy findDeclsInToCtx(DeclContext *DC, DeclarationName Name); diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index 7972d05bedbf7..72c5efde7449b 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -313,12 +313,8 @@ template class OMPVarListClause : public OMPClause { unsigned varlist_size() const { return NumVars; } bool varlist_empty() const { return NumVars == 0; } - varlist_range varlist() { - return varlist_range(varlist_begin(), varlist_end()); - } - varlist_const_range varlist() const { - return varlist_const_range(varlist_begin(), varlist_end()); - } + varlist_range varlist() { return getVarRefs(); } + varlist_const_range varlist() const { return getVarRefs(); } varlist_iterator varlist_begin() { return getVarRefs().begin(); } varlist_iterator varlist_end() { return getVarRefs().end(); } @@ -3404,14 +3400,10 @@ class OMPPrivateClause final using private_copies_const_range = llvm::iterator_range; - private_copies_range private_copies() { - return private_copies_range(getPrivateCopies().begin(), - getPrivateCopies().end()); - } + private_copies_range private_copies() { return getPrivateCopies(); } private_copies_const_range private_copies() const { - return private_copies_const_range(getPrivateCopies().begin(), - getPrivateCopies().end()); + return getPrivateCopies(); } child_range children() { @@ -3531,13 +3523,9 @@ class OMPFirstprivateClause final using private_copies_const_range = llvm::iterator_range; - private_copies_range private_copies() { - return private_copies_range(getPrivateCopies().begin(), - getPrivateCopies().end()); - } + private_copies_range private_copies() { return getPrivateCopies(); } private_copies_const_range private_copies() const { - return private_copies_const_range(getPrivateCopies().begin(), - getPrivateCopies().end()); + return getPrivateCopies(); } using inits_iterator = MutableArrayRef::iterator; @@ -3545,12 +3533,8 @@ class OMPFirstprivateClause final using inits_range = llvm::iterator_range; using inits_const_range = llvm::iterator_range; - inits_range inits() { - return inits_range(getInits().begin(), getInits().end()); - } - inits_const_range inits() const { - return inits_const_range(getInits().begin(), getInits().end()); - } + inits_range inits() { return getInits(); } + inits_const_range inits() const { return getInits(); } child_range children() { return child_range(reinterpret_cast(varlist_begin()), @@ -3752,44 +3736,23 @@ class OMPLastprivateClause final /// copies of original lastprivate variables. 
void setPrivateCopies(ArrayRef PrivateCopies); - helper_expr_const_range private_copies() const { - return helper_expr_const_range(getPrivateCopies().begin(), - getPrivateCopies().end()); - } + helper_expr_const_range private_copies() const { return getPrivateCopies(); } - helper_expr_range private_copies() { - return helper_expr_range(getPrivateCopies().begin(), - getPrivateCopies().end()); - } + helper_expr_range private_copies() { return getPrivateCopies(); } - helper_expr_const_range source_exprs() const { - return helper_expr_const_range(getSourceExprs().begin(), - getSourceExprs().end()); - } + helper_expr_const_range source_exprs() const { return getSourceExprs(); } - helper_expr_range source_exprs() { - return helper_expr_range(getSourceExprs().begin(), getSourceExprs().end()); - } + helper_expr_range source_exprs() { return getSourceExprs(); } helper_expr_const_range destination_exprs() const { - return helper_expr_const_range(getDestinationExprs().begin(), - getDestinationExprs().end()); + return getDestinationExprs(); } - helper_expr_range destination_exprs() { - return helper_expr_range(getDestinationExprs().begin(), - getDestinationExprs().end()); - } + helper_expr_range destination_exprs() { return getDestinationExprs(); } - helper_expr_const_range assignment_ops() const { - return helper_expr_const_range(getAssignmentOps().begin(), - getAssignmentOps().end()); - } + helper_expr_const_range assignment_ops() const { return getAssignmentOps(); } - helper_expr_range assignment_ops() { - return helper_expr_range(getAssignmentOps().begin(), - getAssignmentOps().end()); - } + helper_expr_range assignment_ops() { return getAssignmentOps(); } child_range children() { return child_range(reinterpret_cast(varlist_begin()), @@ -4178,79 +4141,45 @@ class OMPReductionClause final using helper_flag_const_range = llvm::iterator_range; - helper_expr_const_range privates() const { - return helper_expr_const_range(getPrivates().begin(), getPrivates().end()); - } + helper_expr_const_range privates() const { return getPrivates(); } - helper_expr_range privates() { - return helper_expr_range(getPrivates().begin(), getPrivates().end()); - } + helper_expr_range privates() { return getPrivates(); } - helper_expr_const_range lhs_exprs() const { - return helper_expr_const_range(getLHSExprs().begin(), getLHSExprs().end()); - } + helper_expr_const_range lhs_exprs() const { return getLHSExprs(); } - helper_expr_range lhs_exprs() { - return helper_expr_range(getLHSExprs().begin(), getLHSExprs().end()); - } + helper_expr_range lhs_exprs() { return getLHSExprs(); } - helper_expr_const_range rhs_exprs() const { - return helper_expr_const_range(getRHSExprs().begin(), getRHSExprs().end()); - } + helper_expr_const_range rhs_exprs() const { return getRHSExprs(); } - helper_expr_range rhs_exprs() { - return helper_expr_range(getRHSExprs().begin(), getRHSExprs().end()); - } + helper_expr_range rhs_exprs() { return getRHSExprs(); } helper_flag_const_range private_var_reduction_flags() const { - return helper_flag_const_range(getPrivateVariableReductionFlags().begin(), - getPrivateVariableReductionFlags().end()); + return getPrivateVariableReductionFlags(); } helper_flag_range private_var_reduction_flags() { - return helper_flag_range(getPrivateVariableReductionFlags().begin(), - getPrivateVariableReductionFlags().end()); + return getPrivateVariableReductionFlags(); } - helper_expr_const_range reduction_ops() const { - return helper_expr_const_range(getReductionOps().begin(), - getReductionOps().end()); - } + 
helper_expr_const_range reduction_ops() const { return getReductionOps(); } - helper_expr_range reduction_ops() { - return helper_expr_range(getReductionOps().begin(), - getReductionOps().end()); - } + helper_expr_range reduction_ops() { return getReductionOps(); } - helper_expr_const_range copy_ops() const { - return helper_expr_const_range(getInscanCopyOps().begin(), - getInscanCopyOps().end()); - } + helper_expr_const_range copy_ops() const { return getInscanCopyOps(); } - helper_expr_range copy_ops() { - return helper_expr_range(getInscanCopyOps().begin(), - getInscanCopyOps().end()); - } + helper_expr_range copy_ops() { return getInscanCopyOps(); } helper_expr_const_range copy_array_temps() const { - return helper_expr_const_range(getInscanCopyArrayTemps().begin(), - getInscanCopyArrayTemps().end()); + return getInscanCopyArrayTemps(); } - helper_expr_range copy_array_temps() { - return helper_expr_range(getInscanCopyArrayTemps().begin(), - getInscanCopyArrayTemps().end()); - } + helper_expr_range copy_array_temps() { return getInscanCopyArrayTemps(); } helper_expr_const_range copy_array_elems() const { - return helper_expr_const_range(getInscanCopyArrayElems().begin(), - getInscanCopyArrayElems().end()); + return getInscanCopyArrayElems(); } - helper_expr_range copy_array_elems() { - return helper_expr_range(getInscanCopyArrayElems().begin(), - getInscanCopyArrayElems().end()); - } + helper_expr_range copy_array_elems() { return getInscanCopyArrayElems(); } child_range children() { return child_range(reinterpret_cast(varlist_begin()), @@ -4450,39 +4379,21 @@ class OMPTaskReductionClause final using helper_expr_const_range = llvm::iterator_range; - helper_expr_const_range privates() const { - return helper_expr_const_range(getPrivates().begin(), getPrivates().end()); - } + helper_expr_const_range privates() const { return getPrivates(); } - helper_expr_range privates() { - return helper_expr_range(getPrivates().begin(), getPrivates().end()); - } + helper_expr_range privates() { return getPrivates(); } - helper_expr_const_range lhs_exprs() const { - return helper_expr_const_range(getLHSExprs().begin(), getLHSExprs().end()); - } + helper_expr_const_range lhs_exprs() const { return getLHSExprs(); } - helper_expr_range lhs_exprs() { - return helper_expr_range(getLHSExprs().begin(), getLHSExprs().end()); - } + helper_expr_range lhs_exprs() { return getLHSExprs(); } - helper_expr_const_range rhs_exprs() const { - return helper_expr_const_range(getRHSExprs().begin(), getRHSExprs().end()); - } + helper_expr_const_range rhs_exprs() const { return getRHSExprs(); } - helper_expr_range rhs_exprs() { - return helper_expr_range(getRHSExprs().begin(), getRHSExprs().end()); - } + helper_expr_range rhs_exprs() { return getRHSExprs(); } - helper_expr_const_range reduction_ops() const { - return helper_expr_const_range(getReductionOps().begin(), - getReductionOps().end()); - } + helper_expr_const_range reduction_ops() const { return getReductionOps(); } - helper_expr_range reduction_ops() { - return helper_expr_range(getReductionOps().begin(), - getReductionOps().end()); - } + helper_expr_range reduction_ops() { return getReductionOps(); } child_range children() { return child_range(reinterpret_cast(varlist_begin()), @@ -4694,48 +4605,28 @@ class OMPInReductionClause final using helper_expr_const_range = llvm::iterator_range; - helper_expr_const_range privates() const { - return helper_expr_const_range(getPrivates().begin(), getPrivates().end()); - } + helper_expr_const_range privates() const { return 
getPrivates(); } - helper_expr_range privates() { - return helper_expr_range(getPrivates().begin(), getPrivates().end()); - } + helper_expr_range privates() { return getPrivates(); } - helper_expr_const_range lhs_exprs() const { - return helper_expr_const_range(getLHSExprs().begin(), getLHSExprs().end()); - } + helper_expr_const_range lhs_exprs() const { return getLHSExprs(); } - helper_expr_range lhs_exprs() { - return helper_expr_range(getLHSExprs().begin(), getLHSExprs().end()); - } + helper_expr_range lhs_exprs() { return getLHSExprs(); } - helper_expr_const_range rhs_exprs() const { - return helper_expr_const_range(getRHSExprs().begin(), getRHSExprs().end()); - } + helper_expr_const_range rhs_exprs() const { return getRHSExprs(); } - helper_expr_range rhs_exprs() { - return helper_expr_range(getRHSExprs().begin(), getRHSExprs().end()); - } + helper_expr_range rhs_exprs() { return getRHSExprs(); } - helper_expr_const_range reduction_ops() const { - return helper_expr_const_range(getReductionOps().begin(), - getReductionOps().end()); - } + helper_expr_const_range reduction_ops() const { return getReductionOps(); } - helper_expr_range reduction_ops() { - return helper_expr_range(getReductionOps().begin(), - getReductionOps().end()); - } + helper_expr_range reduction_ops() { return getReductionOps(); } helper_expr_const_range taskgroup_descriptors() const { - return helper_expr_const_range(getTaskgroupDescriptors().begin(), - getTaskgroupDescriptors().end()); + return getTaskgroupDescriptors(); } helper_expr_range taskgroup_descriptors() { - return helper_expr_range(getTaskgroupDescriptors().begin(), - getTaskgroupDescriptors().end()); + return getTaskgroupDescriptors(); } child_range children() { @@ -4965,52 +4856,36 @@ class OMPLinearClause final using privates_range = llvm::iterator_range; using privates_const_range = llvm::iterator_range; - privates_range privates() { - return privates_range(getPrivates().begin(), getPrivates().end()); - } + privates_range privates() { return getPrivates(); } - privates_const_range privates() const { - return privates_const_range(getPrivates().begin(), getPrivates().end()); - } + privates_const_range privates() const { return getPrivates(); } using inits_iterator = MutableArrayRef::iterator; using inits_const_iterator = ArrayRef::iterator; using inits_range = llvm::iterator_range; using inits_const_range = llvm::iterator_range; - inits_range inits() { - return inits_range(getInits().begin(), getInits().end()); - } + inits_range inits() { return getInits(); } - inits_const_range inits() const { - return inits_const_range(getInits().begin(), getInits().end()); - } + inits_const_range inits() const { return getInits(); } using updates_iterator = MutableArrayRef::iterator; using updates_const_iterator = ArrayRef::iterator; using updates_range = llvm::iterator_range; using updates_const_range = llvm::iterator_range; - updates_range updates() { - return updates_range(getUpdates().begin(), getUpdates().end()); - } + updates_range updates() { return getUpdates(); } - updates_const_range updates() const { - return updates_const_range(getUpdates().begin(), getUpdates().end()); - } + updates_const_range updates() const { return getUpdates(); } using finals_iterator = MutableArrayRef::iterator; using finals_const_iterator = ArrayRef::iterator; using finals_range = llvm::iterator_range; using finals_const_range = llvm::iterator_range; - finals_range finals() { - return finals_range(getFinals().begin(), getFinals().end()); - } + finals_range finals() { return 
getFinals(); } - finals_const_range finals() const { - return finals_const_range(getFinals().begin(), getFinals().end()); - } + finals_const_range finals() const { return getFinals(); } using used_expressions_iterator = MutableArrayRef::iterator; using used_expressions_const_iterator = ArrayRef::iterator; @@ -5270,34 +5145,19 @@ class OMPCopyinClause final using helper_expr_const_range = llvm::iterator_range; - helper_expr_const_range source_exprs() const { - return helper_expr_const_range(getSourceExprs().begin(), - getSourceExprs().end()); - } + helper_expr_const_range source_exprs() const { return getSourceExprs(); } - helper_expr_range source_exprs() { - return helper_expr_range(getSourceExprs().begin(), getSourceExprs().end()); - } + helper_expr_range source_exprs() { return getSourceExprs(); } helper_expr_const_range destination_exprs() const { - return helper_expr_const_range(getDestinationExprs().begin(), - getDestinationExprs().end()); + return getDestinationExprs(); } - helper_expr_range destination_exprs() { - return helper_expr_range(getDestinationExprs().begin(), - getDestinationExprs().end()); - } + helper_expr_range destination_exprs() { return getDestinationExprs(); } - helper_expr_const_range assignment_ops() const { - return helper_expr_const_range(getAssignmentOps().begin(), - getAssignmentOps().end()); - } + helper_expr_const_range assignment_ops() const { return getAssignmentOps(); } - helper_expr_range assignment_ops() { - return helper_expr_range(getAssignmentOps().begin(), - getAssignmentOps().end()); - } + helper_expr_range assignment_ops() { return getAssignmentOps(); } child_range children() { return child_range(reinterpret_cast(varlist_begin()), @@ -5433,34 +5293,19 @@ class OMPCopyprivateClause final using helper_expr_const_range = llvm::iterator_range; - helper_expr_const_range source_exprs() const { - return helper_expr_const_range(getSourceExprs().begin(), - getSourceExprs().end()); - } + helper_expr_const_range source_exprs() const { return getSourceExprs(); } - helper_expr_range source_exprs() { - return helper_expr_range(getSourceExprs().begin(), getSourceExprs().end()); - } + helper_expr_range source_exprs() { return getSourceExprs(); } helper_expr_const_range destination_exprs() const { - return helper_expr_const_range(getDestinationExprs().begin(), - getDestinationExprs().end()); + return getDestinationExprs(); } - helper_expr_range destination_exprs() { - return helper_expr_range(getDestinationExprs().begin(), - getDestinationExprs().end()); - } + helper_expr_range destination_exprs() { return getDestinationExprs(); } - helper_expr_const_range assignment_ops() const { - return helper_expr_const_range(getAssignmentOps().begin(), - getAssignmentOps().end()); - } + helper_expr_const_range assignment_ops() const { return getAssignmentOps(); } - helper_expr_range assignment_ops() { - return helper_expr_range(getAssignmentOps().begin(), - getAssignmentOps().end()); - } + helper_expr_range assignment_ops() { return getAssignmentOps(); } child_range children() { return child_range(reinterpret_cast(varlist_begin()), @@ -6632,18 +6477,14 @@ class OMPMappableExprListClause : public OMPVarListClause, using const_all_decls_iterator = ArrayRef::iterator; using const_all_decls_range = llvm::iterator_range; - const_all_decls_range all_decls() const { - auto A = getUniqueDeclsRef(); - return const_all_decls_range(A.begin(), A.end()); - } + const_all_decls_range all_decls() const { return getUniqueDeclsRef(); } using const_all_num_lists_iterator = ArrayRef::iterator; 
using const_all_num_lists_range = llvm::iterator_range; const_all_num_lists_range all_num_lists() const { - auto A = getDeclNumListsRef(); - return const_all_num_lists_range(A.begin(), A.end()); + return getDeclNumListsRef(); } using const_all_lists_sizes_iterator = ArrayRef::iterator; @@ -6651,8 +6492,7 @@ class OMPMappableExprListClause : public OMPVarListClause, llvm::iterator_range; const_all_lists_sizes_range all_lists_sizes() const { - auto A = getComponentListSizesRef(); - return const_all_lists_sizes_range(A.begin(), A.end()); + return getComponentListSizesRef(); } using const_all_components_iterator = ArrayRef::iterator; @@ -6660,8 +6500,7 @@ class OMPMappableExprListClause : public OMPVarListClause, llvm::iterator_range; const_all_components_range all_components() const { - auto A = getComponentsRef(); - return const_all_components_range(A.begin(), A.end()); + return getComponentsRef(); } using mapperlist_iterator = MutableArrayRef::iterator; @@ -8241,14 +8080,10 @@ class OMPUseDevicePtrClause final using private_copies_const_range = llvm::iterator_range; - private_copies_range private_copies() { - return private_copies_range(getPrivateCopies().begin(), - getPrivateCopies().end()); - } + private_copies_range private_copies() { return getPrivateCopies(); } private_copies_const_range private_copies() const { - return private_copies_const_range(getPrivateCopies().begin(), - getPrivateCopies().end()); + return getPrivateCopies(); } using inits_iterator = MutableArrayRef::iterator; @@ -8256,13 +8091,9 @@ class OMPUseDevicePtrClause final using inits_range = llvm::iterator_range; using inits_const_range = llvm::iterator_range; - inits_range inits() { - return inits_range(getInits().begin(), getInits().end()); - } + inits_range inits() { return getInits(); } - inits_const_range inits() const { - return inits_const_range(getInits().begin(), getInits().end()); - } + inits_const_range inits() const { return getInits(); } child_range children() { return child_range(reinterpret_cast(varlist_begin()), @@ -8904,8 +8735,7 @@ class OMPInitClause final } const_prefs_range prefs() const { - auto Prefs = const_cast(this)->prefs(); - return const_prefs_range(Prefs.begin(), Prefs.end()); + return const_prefs_range(const_cast(this)->prefs()); } static bool classof(const OMPClause *T) { diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 69d7f8e8c3094..187d32d928c6b 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -410,6 +410,10 @@ BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUiZi", "nc") BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWiZi", "nc") BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWiZi", "nc") BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_fadd_f32, "ffZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_fsub_f32, "ffZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_fmin_f32, "ffZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_fmax_f32, "ffZi", "nc") //===----------------------------------------------------------------------===// // R600-NI only builtins. 
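[Editorial illustration, not part of the patch.] A hedged usage sketch of the new float wave-reduce builtins declared above (the "ffZi" prototypes: float result, float value plus an int argument). The trailing integer argument is assumed to select the reduction strategy in the same way as the existing integer wave_reduce builtins, with 0 as the default choice; compile for an amdgcn target.

// Sketch only: wave-level float reductions on AMDGPU.
float wave_sum(float v) { return __builtin_amdgcn_wave_reduce_fadd_f32(v, 0); }
float wave_min(float v) { return __builtin_amdgcn_wave_reduce_fmin_f32(v, 0); }
float wave_max(float v) { return __builtin_amdgcn_wave_reduce_fmax_f32(v, 0); }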
diff --git a/clang/include/clang/Basic/BuiltinsLoongArchLASX.def b/clang/include/clang/Basic/BuiltinsLoongArchLASX.def index c4ea46a3bc5b5..a5eee613d5c9e 100644 --- a/clang/include/clang/Basic/BuiltinsLoongArchLASX.def +++ b/clang/include/clang/Basic/BuiltinsLoongArchLASX.def @@ -986,3 +986,22 @@ TARGET_BUILTIN(__builtin_lasx_xbnz_b, "iV32Uc", "nc", "lasx") TARGET_BUILTIN(__builtin_lasx_xbnz_h, "iV16Us", "nc", "lasx") TARGET_BUILTIN(__builtin_lasx_xbnz_w, "iV8Ui", "nc", "lasx") TARGET_BUILTIN(__builtin_lasx_xbnz_d, "iV4ULLi", "nc", "lasx") + +TARGET_BUILTIN(__builtin_lasx_cast_128_s, "V8fV4f", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_cast_128_d, "V4dV2d", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_cast_128, "V4LLiV2LLi", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_concat_128_s, "V8fV4fV4f", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_concat_128_d, "V4dV2dV2d", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_concat_128, "V4LLiV2LLiV2LLi", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_extract_128_lo_s, "V4fV8f", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_extract_128_lo_d, "V2dV4d", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_extract_128_lo, "V2LLiV4LLi", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_extract_128_hi_s, "V4fV8f", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_extract_128_hi_d, "V2dV4d", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_extract_128_hi, "V2LLiV4LLi", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_insert_128_lo_s, "V8fV8fV4f", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_insert_128_lo_d, "V4dV4dV2d", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_insert_128_lo, "V4LLiV4LLiV2LLi", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_insert_128_hi_s, "V8fV8fV4f", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_insert_128_hi_d, "V4dV4dV2d", "nc", "lasx") +TARGET_BUILTIN(__builtin_lasx_insert_128_hi, "V4LLiV4LLiV2LLi", "nc", "lasx") diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 735f3157b694e..c1441744c8578 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -1291,6 +1291,26 @@ bool ASTNodeImporter::hasSameVisibilityContextAndLinkage(TypedefNameDecl *Found, using namespace clang; +auto ASTImporter::FunctionDeclImportCycleDetector::makeScopedCycleDetection( + const FunctionDecl *D) { + const FunctionDecl *LambdaD = nullptr; + if (!isCycle(D) && D) { + FunctionDeclsWithImportInProgress.insert(D); + LambdaD = D; + } + return llvm::make_scope_exit([this, LambdaD]() { + if (LambdaD) { + FunctionDeclsWithImportInProgress.erase(LambdaD); + } + }); +} + +bool ASTImporter::FunctionDeclImportCycleDetector::isCycle( + const FunctionDecl *D) const { + return FunctionDeclsWithImportInProgress.find(D) != + FunctionDeclsWithImportInProgress.end(); +} + ExpectedType ASTNodeImporter::VisitType(const Type *T) { Importer.FromDiag(SourceLocation(), diag::err_unsupported_ast_node) << T->getTypeClassName(); @@ -4038,7 +4058,10 @@ ExpectedDecl ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) { // E.g.: auto foo() { struct X{}; return X(); } // To avoid an infinite recursion when importing, create the FunctionDecl // with a simplified return type. - if (hasReturnTypeDeclaredInside(D)) { + // Reuse this approach for auto return types declared as typenames from + // template params, tracked in FindFunctionDeclImportCycle. 
+ if (hasReturnTypeDeclaredInside(D) || + Importer.FindFunctionDeclImportCycle.isCycle(D)) { FromReturnTy = Importer.getFromContext().VoidTy; UsedDifferentProtoType = true; } @@ -4061,6 +4084,8 @@ ExpectedDecl ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) { } Error Err = Error::success(); + auto ScopedReturnTypeDeclCycleDetector = + Importer.FindFunctionDeclImportCycle.makeScopedCycleDetection(D); auto T = importChecked(Err, FromTy); auto TInfo = importChecked(Err, FromTSI); auto ToInnerLocStart = importChecked(Err, D->getInnerLocStart()); diff --git a/clang/lib/AST/ByteCode/ByteCodeEmitter.h b/clang/lib/AST/ByteCode/ByteCodeEmitter.h index ca8dc38e65246..dd18341d52a09 100644 --- a/clang/lib/AST/ByteCode/ByteCodeEmitter.h +++ b/clang/lib/AST/ByteCode/ByteCodeEmitter.h @@ -25,11 +25,11 @@ enum Opcode : uint32_t; /// An emitter which links the program to bytecode for later use. class ByteCodeEmitter { protected: - using LabelTy = uint32_t; using AddrTy = uintptr_t; using Local = Scope::Local; public: + using LabelTy = uint32_t; /// Compiles the function into the module. void compileFunc(const FunctionDecl *FuncDecl, Function *Func = nullptr); diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 725db1f77f29c..dd0b8e790d444 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -16,6 +16,7 @@ #include "PrimType.h" #include "Program.h" #include "clang/AST/Attr.h" +#include "llvm/Support/SaveAndRestore.h" using namespace clang; using namespace clang::interp; @@ -2500,17 +2501,18 @@ bool Compiler::VisitAbstractConditionalOperator( const Expr *TrueExpr = E->getTrueExpr(); const Expr *FalseExpr = E->getFalseExpr(); - auto visitChildExpr = [&](const Expr *E) -> bool { - LocalScope S(this); - if (!this->delegate(E)) - return false; - return S.destroyLocals(); - }; + // The TrueExpr and FalseExpr of a conditional operator do _not_ create a + // scope, which means the local variables created within them unconditionally + // always exist. However, we need to later differentiate which branch was + // taken and only destroy the varibles of the active branch. This is what the + // "enabled" flags on local variables are used for. + llvm::SaveAndRestore LAAA(this->VarScope->LocalsAlwaysEnabled, + /*NewValue=*/false); if (std::optional BoolValue = getBoolValue(Condition)) { if (*BoolValue) - return visitChildExpr(TrueExpr); - return visitChildExpr(FalseExpr); + return this->delegate(TrueExpr); + return this->delegate(FalseExpr); } bool IsBcpCall = false; @@ -2542,13 +2544,15 @@ bool Compiler::VisitAbstractConditionalOperator( if (!this->jumpFalse(LabelFalse)) return false; - if (!visitChildExpr(TrueExpr)) + if (!this->delegate(TrueExpr)) return false; + if (!this->jump(LabelEnd)) return false; this->emitLabel(LabelFalse); - if (!visitChildExpr(FalseExpr)) + if (!this->delegate(FalseExpr)) return false; + this->fallthrough(LabelEnd); this->emitLabel(LabelEnd); @@ -2823,10 +2827,10 @@ bool Compiler::VisitCompoundAssignOperator( return false; if (!this->emitLoad(*LT, E)) return false; - if (LT != LHSComputationT) { - if (!this->emitCast(*LT, *LHSComputationT, E)) - return false; - } + if (LT != LHSComputationT && + !this->emitIntegralCast(*LT, *LHSComputationT, E->getComputationLHSType(), + E)) + return false; // Get the RHS value on the stack. if (!this->emitGetLocal(*RT, TempOffset, E)) @@ -2879,10 +2883,9 @@ bool Compiler::VisitCompoundAssignOperator( } // And now cast from LHSComputationT to ResultT. 
- if (ResultT != LHSComputationT) { - if (!this->emitCast(*LHSComputationT, *ResultT, E)) - return false; - } + if (ResultT != LHSComputationT && + !this->emitIntegralCast(*LHSComputationT, *ResultT, E->getType(), E)) + return false; // And store the result in LHS. if (DiscardResult) { @@ -2955,10 +2958,15 @@ bool Compiler::VisitMaterializeTemporaryExpr( bool IsVolatile = SubExpr->getType().isVolatileQualified(); unsigned LocalIndex = allocateLocalPrimitive( E, *SubExprT, IsConst, IsVolatile, E->getExtendingDecl()); + if (!this->VarScope->LocalsAlwaysEnabled && + !this->emitEnableLocal(LocalIndex, E)) + return false; + if (!this->visit(SubExpr)) return false; if (!this->emitSetLocal(*SubExprT, LocalIndex, E)) return false; + return this->emitGetPtrLocal(LocalIndex, E); } @@ -2968,6 +2976,11 @@ bool Compiler::VisitMaterializeTemporaryExpr( if (UnsignedOrNone LocalIndex = allocateLocal(E, Inner->getType(), E->getExtendingDecl())) { InitLinkScope ILS(this, InitLink::Temp(*LocalIndex)); + + if (!this->VarScope->LocalsAlwaysEnabled && + !this->emitEnableLocal(*LocalIndex, E)) + return false; + if (!this->emitGetPtrLocal(*LocalIndex, E)) return false; return this->visitInitializer(SubExpr) && this->emitFinishInit(E); @@ -7229,6 +7242,19 @@ bool Compiler::emitPrimCast(PrimType FromT, PrimType ToT, return false; } +template +bool Compiler::emitIntegralCast(PrimType FromT, PrimType ToT, + QualType ToQT, const Expr *E) { + assert(FromT != ToT); + + if (ToT == PT_IntAP) + return this->emitCastAP(FromT, Ctx.getBitWidth(ToQT), E); + if (ToT == PT_IntAPS) + return this->emitCastAPS(FromT, Ctx.getBitWidth(ToQT), E); + + return this->emitCast(FromT, ToT, E); +} + /// Emits __real(SubExpr) template bool Compiler::emitComplexReal(const Expr *SubExpr) { diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index 359bf28a51c6e..54d39bbc25952 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -393,6 +393,8 @@ class Compiler : public ConstStmtVisitor, bool>, } bool emitPrimCast(PrimType FromT, PrimType ToT, QualType ToQT, const Expr *E); + bool emitIntegralCast(PrimType FromT, PrimType ToT, QualType ToQT, + const Expr *E); PrimType classifyComplexElementType(QualType T) const { assert(T->isAnyComplexType()); @@ -477,12 +479,14 @@ template class VariableScope { VariableScope(Compiler *Ctx, const ValueDecl *VD, ScopeKind Kind = ScopeKind::Block) : Ctx(Ctx), Parent(Ctx->VarScope), ValDecl(VD), Kind(Kind) { + if (Parent) + this->LocalsAlwaysEnabled = Parent->LocalsAlwaysEnabled; Ctx->VarScope = this; } virtual ~VariableScope() { Ctx->VarScope = this->Parent; } - virtual void addLocal(const Scope::Local &Local) { + virtual void addLocal(Scope::Local Local) { llvm_unreachable("Shouldn't be called"); } @@ -519,7 +523,6 @@ template class VariableScope { if (!P) break; } - // Add to this scope. this->addLocal(Local); } @@ -529,6 +532,11 @@ template class VariableScope { VariableScope *getParent() const { return Parent; } ScopeKind getKind() const { return Kind; } + /// Whether locals added to this scope are enabled by default. + /// This is almost always true, except for the two branches + /// of a conditional operator. + bool LocalsAlwaysEnabled = true; + protected: /// Compiler instance. 
Compiler *Ctx; @@ -566,29 +574,48 @@ template class LocalScope : public VariableScope { return Success; } - void addLocal(const Scope::Local &Local) override { + void addLocal(Scope::Local Local) override { if (!Idx) { Idx = static_cast(this->Ctx->Descriptors.size()); this->Ctx->Descriptors.emplace_back(); this->Ctx->emitInitScope(*Idx, {}); } + Local.EnabledByDefault = this->LocalsAlwaysEnabled; this->Ctx->Descriptors[*Idx].emplace_back(Local); } bool emitDestructors(const Expr *E = nullptr) override { if (!Idx) return true; + assert(!this->Ctx->Descriptors[*Idx].empty()); + // Emit destructor calls for local variables of record // type with a destructor. for (Scope::Local &Local : llvm::reverse(this->Ctx->Descriptors[*Idx])) { if (Local.Desc->hasTrivialDtor()) continue; - if (!this->Ctx->emitGetPtrLocal(Local.Offset, E)) - return false; - if (!this->Ctx->emitDestructionPop(Local.Desc, Local.Desc->getLoc())) - return false; + if (!Local.EnabledByDefault) { + typename Emitter::LabelTy EndLabel = this->Ctx->getLabel(); + if (!this->Ctx->emitGetLocalEnabled(Local.Offset, E)) + return false; + if (!this->Ctx->jumpFalse(EndLabel)) + return false; + + if (!this->Ctx->emitGetPtrLocal(Local.Offset, E)) + return false; + + if (!this->Ctx->emitDestructionPop(Local.Desc, Local.Desc->getLoc())) + return false; + + this->Ctx->emitLabel(EndLabel); + } else { + if (!this->Ctx->emitGetPtrLocal(Local.Offset, E)) + return false; + if (!this->Ctx->emitDestructionPop(Local.Desc, Local.Desc->getLoc())) + return false; + } removeIfStoredOpaqueValue(Local); } diff --git a/clang/lib/AST/ByteCode/EvalEmitter.cpp b/clang/lib/AST/ByteCode/EvalEmitter.cpp index 007321791fdd4..a2e01efc8ffd9 100644 --- a/clang/lib/AST/ByteCode/EvalEmitter.cpp +++ b/clang/lib/AST/ByteCode/EvalEmitter.cpp @@ -113,7 +113,7 @@ Scope::Local EvalEmitter::createLocal(Descriptor *D) { InlineDescriptor &Desc = *reinterpret_cast(B->rawData()); Desc.Desc = D; Desc.Offset = sizeof(InlineDescriptor); - Desc.IsActive = true; + Desc.IsActive = false; Desc.IsBase = false; Desc.IsFieldMutable = false; Desc.IsConst = false; @@ -322,6 +322,33 @@ bool EvalEmitter::emitDestroy(uint32_t I, SourceInfo Info) { return true; } +bool EvalEmitter::emitGetLocalEnabled(uint32_t I, SourceInfo Info) { + if (!isActive()) + return true; + + Block *B = getLocal(I); + const InlineDescriptor &Desc = + *reinterpret_cast(B->rawData()); + + S.Stk.push(Desc.IsActive); + return true; +} + +bool EvalEmitter::emitEnableLocal(uint32_t I, SourceInfo Info) { + if (!isActive()) + return true; + + // FIXME: This is a little dirty, but to avoid adding a flag to + // InlineDescriptor that's only ever useful on the toplevel of local + // variables, we reuse the IsActive flag for the enabled state. We should + // probably use a different struct than InlineDescriptor for the block-level + // inline descriptor of local varaibles. + Block *B = getLocal(I); + InlineDescriptor &Desc = *reinterpret_cast(B->rawData()); + Desc.IsActive = true; + return true; +} + /// Global temporaries (LifetimeExtendedTemporary) carry their value /// around as an APValue, which codegen accesses. /// We set their value once when creating them, but we don't update it diff --git a/clang/lib/AST/ByteCode/Function.h b/clang/lib/AST/ByteCode/Function.h index 95add5809afcc..8c309c921afa9 100644 --- a/clang/lib/AST/ByteCode/Function.h +++ b/clang/lib/AST/ByteCode/Function.h @@ -41,6 +41,8 @@ class Scope final { unsigned Offset; /// Descriptor of the local. 
Descriptor *Desc; + /// If the cleanup for this local should be emitted. + bool EnabledByDefault = true; }; using LocalVectorTy = llvm::SmallVector; diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 3e869c1ee5f2c..86b1ba88ca9d4 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -2474,6 +2474,18 @@ inline bool InitScope(InterpState &S, CodePtr OpPC, uint32_t I) { return true; } +inline bool EnableLocal(InterpState &S, CodePtr OpPC, uint32_t I) { + assert(!S.Current->isLocalEnabled(I)); + S.Current->enableLocal(I); + return true; +} + +inline bool GetLocalEnabled(InterpState &S, CodePtr OpPC, uint32_t I) { + assert(S.Current); + S.Stk.push(S.Current->isLocalEnabled(I)); + return true; +} + //===----------------------------------------------------------------------===// // Cast, CastFP //===----------------------------------------------------------------------===// diff --git a/clang/lib/AST/ByteCode/InterpFrame.cpp b/clang/lib/AST/ByteCode/InterpFrame.cpp index 039acb5d72b2c..3b883761ad001 100644 --- a/clang/lib/AST/ByteCode/InterpFrame.cpp +++ b/clang/lib/AST/ByteCode/InterpFrame.cpp @@ -89,11 +89,23 @@ void InterpFrame::destroyScopes() { void InterpFrame::initScope(unsigned Idx) { if (!Func) return; + for (auto &Local : Func->getScope(Idx).locals()) { localBlock(Local.Offset)->invokeCtor(); } } +void InterpFrame::enableLocal(unsigned Idx) { + assert(Func); + + // FIXME: This is a little dirty, but to avoid adding a flag to + // InlineDescriptor that's only ever useful on the toplevel of local + // variables, we reuse the IsActive flag for the enabled state. We should + // probably use a different struct than InlineDescriptor for the block-level + // inline descriptor of local varaibles. + localInlineDesc(Idx)->IsActive = true; +} + void InterpFrame::destroy(unsigned Idx) { for (auto &Local : Func->getScope(Idx).locals_reverse()) { S.deallocate(localBlock(Local.Offset)); diff --git a/clang/lib/AST/ByteCode/InterpFrame.h b/clang/lib/AST/ByteCode/InterpFrame.h index febef1097ea8a..e150e9279a6ef 100644 --- a/clang/lib/AST/ByteCode/InterpFrame.h +++ b/clang/lib/AST/ByteCode/InterpFrame.h @@ -55,6 +55,10 @@ class InterpFrame final : public Frame { void destroy(unsigned Idx); void initScope(unsigned Idx); void destroyScopes(); + void enableLocal(unsigned Idx); + bool isLocalEnabled(unsigned Idx) const { + return localInlineDesc(Idx)->IsActive; + } /// Describes the frame with arguments for diagnostic purposes. 
void describe(llvm::raw_ostream &OS) const override; diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index a236f89dcf78b..6e768793fcfcf 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -251,6 +251,16 @@ def InitScope : Opcode { let Args = [ArgUint32]; } +def GetLocalEnabled : Opcode { + let Args = [ArgUint32]; + let HasCustomEval = 1; +} + +def EnableLocal : Opcode { + let Args = [ArgUint32]; + let HasCustomEval = 1; +} + //===----------------------------------------------------------------------===// // Constants //===----------------------------------------------------------------------===// diff --git a/clang/lib/Basic/Targets/LoongArch.cpp b/clang/lib/Basic/Targets/LoongArch.cpp index 8e29bb745734b..5863af3f3b920 100644 --- a/clang/lib/Basic/Targets/LoongArch.cpp +++ b/clang/lib/Basic/Targets/LoongArch.cpp @@ -242,6 +242,7 @@ void LoongArchTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__loongarch_simd_width", "256"); Builder.defineMacro("__loongarch_sx", Twine(1)); Builder.defineMacro("__loongarch_asx", Twine(1)); + Builder.defineMacro("__loongarch_asx_sx_conv", Twine(1)); } else if (HasFeatureLSX) { Builder.defineMacro("__loongarch_simd_width", "128"); Builder.defineMacro("__loongarch_sx", Twine(1)); diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 334143d414d6d..7af25f7b0e664 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -402,18 +402,26 @@ static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) { case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64: return Intrinsic::amdgcn_wave_reduce_add; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32: + return Intrinsic::amdgcn_wave_reduce_fadd; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64: return Intrinsic::amdgcn_wave_reduce_sub; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32: + return Intrinsic::amdgcn_wave_reduce_fsub; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64: return Intrinsic::amdgcn_wave_reduce_min; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32: + return Intrinsic::amdgcn_wave_reduce_fmin; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64: return Intrinsic::amdgcn_wave_reduce_umin; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64: return Intrinsic::amdgcn_wave_reduce_max; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32: + return Intrinsic::amdgcn_wave_reduce_fmax; case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32: case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64: return Intrinsic::amdgcn_wave_reduce_umax; @@ -435,11 +443,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::SyncScope::ID SSID; switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32: + case 
AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32: case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32: diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 19c42c88762fb..200ee13901f4b 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -5033,8 +5033,11 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, return true; // Space between import . // or import .....; - if (Left.is(Keywords.kw_import) && Right.isOneOf(tok::less, tok::ellipsis)) + if (Left.is(Keywords.kw_import) && + Right.isOneOf(tok::less, tok::ellipsis) && + (!BeforeLeft || BeforeLeft->is(tok::kw_export))) { return true; + } // Space between `module :` and `import :`. if (Left.isOneOf(Keywords.kw_module, Keywords.kw_import) && Right.is(TT_ModulePartitionColon)) { diff --git a/clang/lib/Headers/lasxintrin.h b/clang/lib/Headers/lasxintrin.h index 85020d82829e2..83cc4288a990c 100644 --- a/clang/lib/Headers/lasxintrin.h +++ b/clang/lib/Headers/lasxintrin.h @@ -10,6 +10,8 @@ #ifndef _LOONGSON_ASXINTRIN_H #define _LOONGSON_ASXINTRIN_H 1 +#include + #if defined(__loongarch_asx) typedef signed char v32i8 __attribute__((vector_size(32), aligned(32))); @@ -3882,5 +3884,116 @@ extern __inline #define __lasx_xvrepli_w(/*si10*/ _1) ((__m256i)__builtin_lasx_xvrepli_w((_1))) +#if defined(__loongarch_asx_sx_conv) + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, + __artificial__)) __m256 __lasx_cast_128_s(__m128 _1) { + return (__m256)__builtin_lasx_cast_128_s((v4f32)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d + __lasx_cast_128_d(__m128d _1) { + return (__m256d)__builtin_lasx_cast_128_d((v2f64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i + __lasx_cast_128(__m128i _1) { + return (__m256i)__builtin_lasx_cast_128((v2i64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256 + __lasx_concat_128_s(__m128 _1, __m128 _2) { + return (__m256)__builtin_lasx_concat_128_s((v4f32)_1, (v4f32)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d + __lasx_concat_128_d(__m128d _1, __m128d _2) { + return (__m256d)__builtin_lasx_concat_128_d((v2f64)_1, (v2f64)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i + __lasx_concat_128(__m128i _1, __m128i _2) { + return (__m256i)__builtin_lasx_concat_128((v2i64)_1, (v2i64)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128 + __lasx_extract_128_lo_s(__m256 _1) { + return (__m128)__builtin_lasx_extract_128_lo_s((v8f32)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d + __lasx_extract_128_lo_d(__m256d _1) { + return (__m128d)__builtin_lasx_extract_128_lo_d((v4f64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128i + __lasx_extract_128_lo(__m256i _1) { + return (__m128i)__builtin_lasx_extract_128_lo((v4i64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128 + 
__lasx_extract_128_hi_s(__m256 _1) { + return (__m128)__builtin_lasx_extract_128_hi_s((v8f32)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d + __lasx_extract_128_hi_d(__m256d _1) { + return (__m128d)__builtin_lasx_extract_128_hi_d((v4f64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128i + __lasx_extract_128_hi(__m256i _1) { + return (__m128i)__builtin_lasx_extract_128_hi((v4i64)_1); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256 + __lasx_insert_128_lo_s(__m256 _1, __m128 _2) { + return (__m256)__builtin_lasx_insert_128_lo_s((v8f32)_1, (v4f32)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d + __lasx_insert_128_lo_d(__m256d _1, __m128d _2) { + return (__m256d)__builtin_lasx_insert_128_lo_d((v4f64)_1, (v2f64)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i + __lasx_insert_128_lo(__m256i _1, __m128i _2) { + return (__m256i)__builtin_lasx_insert_128_lo((v4i64)_1, (v2i64)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256 + __lasx_insert_128_hi_s(__m256 _1, __m128 _2) { + return (__m256)__builtin_lasx_insert_128_hi_s((v8f32)_1, (v4f32)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d + __lasx_insert_128_hi_d(__m256d _1, __m128d _2) { + return (__m256d)__builtin_lasx_insert_128_hi_d((v4f64)_1, (v2f64)_2); +} + +extern __inline + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i + __lasx_insert_128_hi(__m256i _1, __m128i _2) { + return (__m256i)__builtin_lasx_insert_128_hi((v4i64)_1, (v2i64)_2); +} + +#endif /* defined(__loongarch_asx_sx_conv). */ #endif /* defined(__loongarch_asx). */ #endif /* _LOONGSON_ASXINTRIN_H. 
*/ diff --git a/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp index 019e81f91400d..027bf780273cc 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StackAddrEscapeChecker.cpp @@ -22,6 +22,7 @@ #include "clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h" #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/raw_ostream.h" using namespace clang; using namespace ento; @@ -247,6 +248,7 @@ class FindStackRegionsSymbolVisitor final : public SymbolVisitor { CheckerContext &Ctxt; const StackFrameContext *PoppedStackFrame; SmallVectorImpl &EscapingStackRegions; + llvm::SmallPtrSet VisitedRegions; public: explicit FindStackRegionsSymbolVisitor( @@ -258,6 +260,9 @@ class FindStackRegionsSymbolVisitor final : public SymbolVisitor { bool VisitSymbol(SymbolRef sym) override { return true; } bool VisitMemRegion(const MemRegion *MR) override { + if (!VisitedRegions.insert(MR).second) + return true; + SaveIfEscapes(MR); if (const BlockDataRegion *BDR = MR->getAs()) diff --git a/clang/lib/StaticAnalyzer/Core/SarifDiagnostics.cpp b/clang/lib/StaticAnalyzer/Core/SarifDiagnostics.cpp index 6673f2f319c0e..aafd8d45537e3 100644 --- a/clang/lib/StaticAnalyzer/Core/SarifDiagnostics.cpp +++ b/clang/lib/StaticAnalyzer/Core/SarifDiagnostics.cpp @@ -202,11 +202,10 @@ SarifDiagnostics::createResult(const PathDiagnostic *Diag, // Find the HTML report that was generated for this issue, if one exists. PDFileEntry::ConsumerFiles *Files = FM->getFiles(*Diag); if (Files) { - auto HtmlFile = - std::find_if(Files->cbegin(), Files->cend(), [](auto &File) { - return File.first == HTML_DIAGNOSTICS_NAME; - }); - if (HtmlFile != Files->cend()) { + auto HtmlFile = llvm::find_if(*Files, [](const auto &File) { + return File.first == HTML_DIAGNOSTICS_NAME; + }); + if (HtmlFile != Files->end()) { SmallString<128> HtmlReportPath = llvm::sys::path::parent_path(OutputFile); llvm::sys::path::append(HtmlReportPath, HtmlFile->second); diff --git a/clang/test/AST/ByteCode/cxx23.cpp b/clang/test/AST/ByteCode/cxx23.cpp index c5d26925ce11b..819460628c1b7 100644 --- a/clang/test/AST/ByteCode/cxx23.cpp +++ b/clang/test/AST/ByteCode/cxx23.cpp @@ -473,3 +473,26 @@ namespace AIEWithIndex0Narrows { } static_assert(test()); } + +#if __cplusplus >= 202302L +namespace InactiveLocalsInConditionalOp { + struct A { constexpr A(){}; ~A(); constexpr int get() { return 10; } }; // all-note 2{{declared here}} + constexpr int get(bool b) { + return b ? A().get() : 1; // all-note {{non-constexpr function '~A' cannot be used in a constant expression}} + } + static_assert(get(false) == 1, ""); + static_assert(get(true) == 10, ""); // all-error {{not an integral constant expression}} \ + // all-note {{in call to}} + + static_assert( (false ? A().get() : 1) == 1); + static_assert( (true ? A().get() : 1) == 1); // all-error {{not an integral constant expression}} \ + // all-note {{non-constexpr function '~A' cannot be used in a constant expression}} + + constexpr bool test2(bool b) { + unsigned long __ms = b ? 
(const unsigned long &)0 : __ms; + return true; + } + static_assert(test2(true)); + +} +#endif diff --git a/clang/test/AST/ByteCode/intap.cpp b/clang/test/AST/ByteCode/intap.cpp index 05ab319bf16df..efb60cb0abffe 100644 --- a/clang/test/AST/ByteCode/intap.cpp +++ b/clang/test/AST/ByteCode/intap.cpp @@ -305,6 +305,46 @@ namespace UnderlyingInt128 { static_assert(foo() == 0, ""); // both-error {{not an integral constant expression}} \ // both-note {{in call to}} } + +namespace CompoundAssignOperators { + constexpr unsigned __int128 foo() { + long b = 10; + + b += (__int128)1; + b -= (__int128)1; + b *= (__int128)1; + b /= (__int128)1; + + b += (unsigned __int128)1; + b -= (unsigned __int128)1; + b *= (unsigned __int128)1; + b /= (unsigned __int128)1; + + __int128 i = 10; + i += (__int128)1; + i -= (__int128)1; + i *= (__int128)1; + i /= (__int128)1; + i += (unsigned __int128)1; + i -= (unsigned __int128)1; + i *= (unsigned __int128)1; + i /= (unsigned __int128)1; + + unsigned __int128 i2 = 10; + i2 += (__int128)1; + i2 -= (__int128)1; + i2 *= (__int128)1; + i2 /= (__int128)1; + i2 += (unsigned __int128)1; + i2 -= (unsigned __int128)1; + i2 *= (unsigned __int128)1; + i2 /= (unsigned __int128)1; + + return (int)b; + } + static_assert(foo() == 10); +} + #endif #endif diff --git a/clang/test/Analysis/stackaddrleak.c b/clang/test/Analysis/stackaddrleak.c index 95175996e8274..96bd4e4ea19e5 100644 --- a/clang/test/Analysis/stackaddrleak.c +++ b/clang/test/Analysis/stackaddrleak.c @@ -1,5 +1,5 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc -verify -std=c99 -Dbool=_Bool -Wno-bool-conversion %s -// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc -verify -x c++ -Wno-bool-conversion %s +// RUN: %clang_analyze_cc1 -fblocks -analyzer-checker=core,unix.Malloc -verify -std=c99 -Dbool=_Bool -Wno-bool-conversion %s +// RUN: %clang_analyze_cc1 -fblocks -analyzer-checker=core,unix.Malloc -verify -x c++ -Wno-bool-conversion %s typedef __INTPTR_TYPE__ intptr_t; char const *p; @@ -90,3 +90,14 @@ struct child_stack_context_s return_child_stack_context_field() { } return s; // expected-warning {{Address of stack memory associated with local variable 'a' returned to caller}} } + +// Returns an 'int' block taking an 'int'. +int (^copy_self_referencing_block(void))(int) { + // It is important that the 'fib' block captures itself. 
+ __block int (^fib)(int) = ^(int n) { + if (n <= 1) return n; + return fib(n - 1) + fib(n - 2); + }; + return fib; // no-crash when copying a self-referencing 'fib' + // expected-warning-re@-1 {{Address of stack-allocated block declared on line {{[0-9]+}} is captured by a returned block}} +} diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c b/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c index 03a746c966cdd..4f289ca84c271 100644 --- a/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c +++ b/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c @@ -7120,6 +7120,177 @@ v16i16 xvrepli_h() { return __lasx_xvrepli_h(1); } // CHECK-NEXT: ret void // v8i32 xvrepli_w() { return __lasx_xvrepli_w(1); } +// CHECK-LABEL: define dso_local void @cast_128_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v8f32 cast_128_s(v4f32 _1) { return __lasx_cast_128_s(_1); } +// CHECK-LABEL: define dso_local void @cast_128_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4f64 cast_128_d(v2f64 _1) { return __lasx_cast_128_d(_1); } +// CHECK-LABEL: define dso_local void @cast_128( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> [[TMP0]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4i64 cast_128(v2i64 _1) { return __lasx_cast_128(_1); } +// CHECK-LABEL: define dso_local void @concat_128_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v8f32 concat_128_s(v4f32 _1, v4f32 _2) { return __lasx_concat_128_s(_1, _2); } +// CHECK-LABEL: define dso_local void @concat_128_d( +// CHECK-SAME: ptr 
dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4f64 concat_128_d(v2f64 _1, v2f64 _2) { return __lasx_concat_128_d(_1, _2); } +// CHECK-LABEL: define dso_local void @concat_128( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4i64 concat_128(v2i64 _1, v2i64 _2) { return __lasx_concat_128(_1, _2); } +// CHECK-LABEL: define dso_local i128 @extract_128_lo_s( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v4f32 extract_128_lo_s(v8f32 _1) { return __lasx_extract_128_lo_s(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_lo_d( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v2f64 extract_128_lo_d(v4f64 _1) { return __lasx_extract_128_lo_d(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_lo( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v2i64 extract_128_lo(v4i64 _1) { return __lasx_extract_128_lo(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_hi_s( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa 
[[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v4f32 extract_128_hi_s(v8f32 _1) { return __lasx_extract_128_hi_s(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_hi_d( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v2f64 extract_128_hi_d(v4f64 _1) { return __lasx_extract_128_hi_d(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_hi( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v2i64 extract_128_hi(v4i64 _1) { return __lasx_extract_128_hi(_1); } +// CHECK-LABEL: define dso_local void @insert_128_lo_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> [[_1]], <4 x float> [[TMP1]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v8f32 insert_128_lo_s(v8f32 _1, v4f32 _2) { return __lasx_insert_128_lo_s(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_lo_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> [[_1]], <2 x double> [[TMP1]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4f64 insert_128_lo_d(v4f64 _1, v2f64 _2) { return __lasx_insert_128_lo_d(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_lo( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> [[_1]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4i64 insert_128_lo(v4i64 _1, v2i64 _2) { return __lasx_insert_128_lo(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_hi_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> [[_1]], <4 x float> [[TMP1]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v8f32 insert_128_hi_s(v8f32 _1, v4f32 _2) { return __lasx_insert_128_hi_s(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_hi_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> [[_1]], <2 x double> [[TMP1]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4f64 insert_128_hi_d(v4f64 _1, v2f64 _2) { return __lasx_insert_128_hi_d(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_hi( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64> [[_1]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4i64 insert_128_hi(v4i64 _1, v2i64 _2) { return __lasx_insert_128_hi(_1, _2); } //. 
// CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin.c b/clang/test/CodeGen/LoongArch/lasx/builtin.c index 700e845cd662a..373efefc9b264 100644 --- a/clang/test/CodeGen/LoongArch/lasx/builtin.c +++ b/clang/test/CodeGen/LoongArch/lasx/builtin.c @@ -1,6 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 // RUN: %clang_cc1 -triple loongarch64 -target-feature +lasx -O2 -emit-llvm %s -o - | FileCheck %s +typedef long long v2i64 __attribute__ ((vector_size(16), aligned(16))); +typedef float v4f32 __attribute__((vector_size(16), aligned(16))); +typedef double v2f64 __attribute__((vector_size(16), aligned(16))); + typedef signed char v32i8 __attribute__((vector_size(32), aligned(32))); typedef signed char v32i8_b __attribute__((vector_size(32), aligned(1))); typedef unsigned char v32u8 __attribute__((vector_size(32), aligned(32))); @@ -7142,6 +7146,177 @@ v16i16 xvrepli_h() { return __builtin_lasx_xvrepli_h(1); } // CHECK-NEXT: ret void // v8i32 xvrepli_w() { return __builtin_lasx_xvrepli_w(1); } +// CHECK-LABEL: define dso_local void @cast_128_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> [[TMP0]]) +// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v8f32 cast_128_s(v4f32 _1) { return __builtin_lasx_cast_128_s(_1); } +// CHECK-LABEL: define dso_local void @cast_128_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> [[TMP0]]) +// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4f64 cast_128_d(v2f64 _1) { return __builtin_lasx_cast_128_d(_1); } +// CHECK-LABEL: define dso_local void @cast_128( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> [[TMP0]]) +// CHECK-NEXT: store <4 x i64> [[TMP1]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4i64 cast_128(v2i64 _1) { return __builtin_lasx_cast_128(_1); } +// CHECK-LABEL: define dso_local void @concat_128_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to 
<4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v8f32 concat_128_s(v4f32 _1, v4f32 _2) { return __builtin_lasx_concat_128_s(_1, _2); } +// CHECK-LABEL: define dso_local void @concat_128_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4f64 concat_128_d(v2f64 _1, v2f64 _2) { return __builtin_lasx_concat_128_d(_1, _2); } +// CHECK-LABEL: define dso_local void @concat_128( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], i128 noundef [[_1_COERCE:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4i64 concat_128(v2i64 _1, v2i64 _2) { return __builtin_lasx_concat_128(_1, _2); } +// CHECK-LABEL: define dso_local i128 @extract_128_lo_s( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v4f32 extract_128_lo_s(v8f32 _1) { return __builtin_lasx_extract_128_lo_s(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_lo_d( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v2f64 extract_128_lo_d(v4f64 _1) { return __builtin_lasx_extract_128_lo_d(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_lo( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail 
call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v2i64 extract_128_lo(v4i64 _1) { return __builtin_lasx_extract_128_lo(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_hi_s( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v4f32 extract_128_hi_s(v8f32 _1) { return __builtin_lasx_extract_128_hi_s(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_hi_d( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v2f64 extract_128_hi_d(v4f64 _1) { return __builtin_lasx_extract_128_hi_d(_1); } +// CHECK-LABEL: define dso_local i128 @extract_128_hi( +// CHECK-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR7]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> [[_1]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +// CHECK-NEXT: ret i128 [[TMP2]] +// +v2i64 extract_128_hi(v4i64 _1) { return __builtin_lasx_extract_128_hi(_1); } +// CHECK-LABEL: define dso_local void @insert_128_lo_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> [[_1]], <4 x float> [[TMP1]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v8f32 insert_128_lo_s(v8f32 _1, v4f32 _2) { return __builtin_lasx_insert_128_lo_s(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_lo_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> [[_1]], <2 x double> [[TMP1]]) 
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4f64 insert_128_lo_d(v4f64 _1, v2f64 _2) { return __builtin_lasx_insert_128_lo_d(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_lo( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> [[_1]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4i64 insert_128_lo(v4i64 _1, v2i64 _2) { return __builtin_lasx_insert_128_lo(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_hi_s( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<8 x float>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> [[_1]], <4 x float> [[TMP1]]) +// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v8f32 insert_128_hi_s(v8f32 _1, v4f32 _2) { return __builtin_lasx_insert_128_hi_s(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_hi_d( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> [[_1]], <2 x double> [[TMP1]]) +// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4f64 insert_128_hi_d(v4f64 _1, v2f64 _2) { return __builtin_lasx_insert_128_hi_d(_1, _2); } +// CHECK-LABEL: define dso_local void @insert_128_hi( +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 32 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], i128 noundef [[_2_COERCE:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[_1:%.*]] = load <4 x i64>, ptr [[TMP0]], align 32, !tbaa [[CHAR_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64> [[_1]], <2 x i64> [[TMP1]]) +// CHECK-NEXT: store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32, !tbaa 
[[CHAR_TBAA6]] +// CHECK-NEXT: ret void +// +v4i64 insert_128_hi(v4i64 _1, v2i64 _2) { return __builtin_lasx_insert_128_hi(_1, _2); } //. // CHECK: [[META4:![0-9]+]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index d22f2f8be8be3..13ad0545ab53f 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -305,12 +305,16 @@ __m256i test_mm256_bslli_epi128(__m256i a) { // CHECK: shufflevector <32 x i8> zeroinitializer, <32 x i8> %{{.*}}, <32 x i32> return _mm256_bslli_epi128(a, 3); } +TEST_CONSTEXPR(match_v32qi(_mm256_bslli_epi128(((__m256i)(__v32qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 3), 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 0, 0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29)); +TEST_CONSTEXPR(match_v32qi(_mm256_bslli_epi128(((__m256i)(__v32qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 16), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m256i test_mm256_bsrli_epi128(__m256i a) { // CHECK-LABEL: test_mm256_bsrli_epi128 // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, <32 x i32> return _mm256_bsrli_epi128(a, 3); } +TEST_CONSTEXPR(match_v32qi(_mm256_bsrli_epi128(((__m256i)(__v32qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 3), 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 0, 0, 0)); +TEST_CONSTEXPR(match_v32qi(_mm256_bsrli_epi128(((__m256i)(__v32qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 16), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m256i test_mm256_cmpeq_epi8(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_cmpeq_epi8 diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index b92454de60c78..a5132c9114673 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -405,6 +405,13 @@ void test_wave_reduce_add_u64_default(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_add_u64(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fadd_f32_default +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fadd.f32( +void test_wave_reduce_fadd_f32_default(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_add_u32_iterative // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32( void test_wave_reduce_add_u32_iterative(global int* out, int in) @@ -419,6 +426,13 @@ void test_wave_reduce_add_u64_iterative(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_add_u64(in, 1); } +// CHECK-LABEL: @test_wave_reduce_fadd_f32_iterative +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fadd.f32( +void test_wave_reduce_fadd_f32_iterative(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_add_u32_dpp // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32( void test_wave_reduce_add_u32_dpp(global int* out, int in) 
@@ -433,6 +447,13 @@ void test_wave_reduce_add_u64_dpp(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_add_u64(in, 2); } +// CHECK-LABEL: @test_wave_reduce_fadd_f32_dpp +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fadd.f32( +void test_wave_reduce_fadd_f32_dpp(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_sub_u32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( void test_wave_reduce_sub_u32_default(global int* out, int in) @@ -447,6 +468,13 @@ void test_wave_reduce_sub_u64_default(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fsub_f32_default +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fsub.f32( +void test_wave_reduce_fsub_f32_default(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_sub_u32_iterative // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( void test_wave_reduce_sub_u32_iterative(global int* out, int in) @@ -461,6 +489,13 @@ void test_wave_reduce_sub_u64_iterative(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 1); } +// CHECK-LABEL: @test_wave_reduce_fsub_f32_iterative +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fsub.f32( +void test_wave_reduce_fsub_f32_iterative(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_sub_u32_dpp // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( void test_wave_reduce_sub_u32_dpp(global int* out, int in) @@ -475,6 +510,13 @@ void test_wave_reduce_sub_u64_dpp(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 2); } +// CHECK-LABEL: @test_wave_reduce_fsub_f32_dpp +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fsub.f32( +void test_wave_reduce_fsub_f32_dpp(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_and_b32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32( void test_wave_reduce_and_b32_default(global int* out, int in) @@ -615,6 +657,13 @@ void test_wave_reduce_min_i64_default(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_min_i64(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fmin_f32_default +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmin.f32( +void test_wave_reduce_fmin_f32_default(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_min_i32_iterative // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32( void test_wave_reduce_min_i32_iterative(global int* out, int in) @@ -629,6 +678,13 @@ void test_wave_reduce_min_i64_iterative(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_min_i64(in, 1); } +// CHECK-LABEL: @test_wave_reduce_fmin_f32_iterative +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmin.f32( +void test_wave_reduce_fmin_f32_iterative(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_min_i32_dpp // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32( void test_wave_reduce_min_i32_dpp(global int* out, int in) @@ -643,6 +699,13 @@ void test_wave_reduce_min_i64_dpp(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_min_i64(in, 2); } +// CHECK-LABEL: 
@test_wave_reduce_fmin_f32_dpp +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmin.f32( +void test_wave_reduce_fmin_f32_dpp(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_min_u32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32( void test_wave_reduce_min_u32_default(global int* out, int in) @@ -699,6 +762,13 @@ void test_wave_reduce_max_i64_default(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_max_i64(in, 0); } +// CHECK-LABEL: @test_wave_reduce_fmax_f32_default +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmax.f32( +void test_wave_reduce_fmax_f32_default(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_max_i32_iterative // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32( void test_wave_reduce_max_i32_iterative(global int* out, int in) @@ -713,6 +783,13 @@ void test_wave_reduce_max_i64_iterative(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_max_i64(in, 1); } +// CHECK-LABEL: @test_wave_reduce_fmax_f32_iterative +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmax.f32( +void test_wave_reduce_fmax_f32_iterative(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_max_i32_dpp // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32( void test_wave_reduce_max_i32_dpp(global int* out, int in) @@ -727,6 +804,13 @@ void test_wave_reduce_max_i64_dpp(global int* out, long in) *out = __builtin_amdgcn_wave_reduce_max_i64(in, 2); } +// CHECK-LABEL: @test_wave_reduce_fmax_f32_dpp +// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmax.f32( +void test_wave_reduce_fmax_f32_dpp(global float* out, float in) +{ + *out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 0); +} + // CHECK-LABEL: @test_wave_reduce_max_u32_default // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32( void test_wave_reduce_max_u32_default(global int* out, int in) diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c index fd7ce2073a512..b2c0b51464a80 100644 --- a/clang/test/Preprocessor/init-loongarch.c +++ b/clang/test/Preprocessor/init-loongarch.c @@ -925,6 +925,7 @@ // RUN: %clang --target=loongarch64 -mno-lasx -mlsx -x c -E -dM %s -o - \ // RUN: | FileCheck --match-full-lines --check-prefix=MLSX %s // MLSX-NOT: #define __loongarch_asx +// MLSX-NOT: #define __loongarch_asx_sx_conv // MLSX: #define __loongarch_simd_width 128 // MLSX: #define __loongarch_sx 1 @@ -937,6 +938,7 @@ // RUN: %clang --target=loongarch64 -mno-lasx -mlasx -x c -E -dM %s -o - \ // RUN: | FileCheck --match-full-lines --check-prefix=MLASX %s // MLASX: #define __loongarch_asx 1 +// MLASX: #define __loongarch_asx_sx_conv 1 // MLASX: #define __loongarch_simd_width 256 // MLASX: #define __loongarch_sx 1 @@ -953,5 +955,6 @@ // RUN: %clang --target=loongarch64 -mno-lsx -march=la464 -x c -E -dM %s -o - \ // RUN: | FileCheck --match-full-lines --check-prefix=MNO-LSX %s // MNO-LSX-NOT: #define __loongarch_asx +// MNO-LSX-NOT: #define __loongarch_asx_sx_conv // MNO-LSX-NOT: #define __loongarch_simd_width // MNO-LSX-NOT: #define __loongarch_sx diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index 3cab4c600b1b1..164790606ea0b 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ 
-3204,6 +3204,57 @@ TEST_P(ImportExpr, UnresolvedMemberExpr) { compoundStmt(has(callExpr(has(unresolvedMemberExpr()))))))))); } +TEST_P(ImportDecl, CycleInAutoTemplateSpec) { + MatchVerifier Verifier; + const char *Code = R"( + template + struct basic_string { + using value_type = _CharT; + }; + + template + struct basic_string_view { + using value_type = T; + }; + + using string_view = basic_string_view; + using string = basic_string; + + template + struct span { + }; + + template + auto StrCatT(span pieces) { + basic_string result; + return result; + } + + string StrCat(span pieces) { + return StrCatT(pieces); + } + + string StrCat(span pieces) { + return StrCatT(pieces); + } + + template + auto declToImport(T pieces) { + return StrCat(pieces); + } + + void test() { + span pieces; + auto result = declToImport(pieces); + } +)"; + // This test reproduces the StrCatT recursion pattern with concepts and span + // that may cause infinite recursion during AST import due to circular + // dependencies. + testImport(Code, Lang_CXX20, "", Lang_CXX20, Verifier, + functionTemplateDecl(hasName("declToImport"))); + } + TEST_P(ImportExpr, ConceptNoRequirement) { MatchVerifier Verifier; const char *Code = R"( diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 81fa7d1d11aa4..5a5d77075bb3a 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -27328,6 +27328,7 @@ TEST_F(FormatTest, Cpp20ModulesSupport) { verifyFormat("export", Style); verifyFormat("import /* not keyword */ = val ? 2 : 1;"); + verifyFormat("_world->import();"); } TEST_F(FormatTest, CoroutineForCoawait) { diff --git a/libcxx/include/__algorithm/all_of.h b/libcxx/include/__algorithm/all_of.h index 6acc117fc47bc..9bdb20a0d7b2f 100644 --- a/libcxx/include/__algorithm/all_of.h +++ b/libcxx/include/__algorithm/all_of.h @@ -10,24 +10,28 @@ #ifndef _LIBCPP___ALGORITHM_ALL_OF_H #define _LIBCPP___ALGORITHM_ALL_OF_H +#include <__algorithm/any_of.h> #include <__config> #include <__functional/identity.h> #include <__type_traits/invoke.h> +#include <__utility/forward.h> +#include <__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD template <class _Iter, class _Sent, class _Pred, class _Proj> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 bool __all_of(_Iter __first, _Sent __last, _Pred& __pred, _Proj& __proj) { - for (; __first != __last; ++__first) { - if (!std::__invoke(__pred, std::__invoke(__proj, *__first))) - return false; - } - return true; + using _Ref = decltype(std::__invoke(__proj, *__first)); + auto __negated_pred = [&__pred](_Ref __arg) -> bool { return !std::__invoke(__pred, std::forward<_Ref>(__arg)); }; + return !std::__any_of(std::move(__first), std::move(__last), __negated_pred, __proj); } template <class _InputIterator, class _Predicate> @@ -39,4 +43,6 @@ all_of(_InputIterator __first, _InputIterator __last, _Predicate __pred) { _LIBCPP_END_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___ALGORITHM_ALL_OF_H diff --git a/libcxx/include/__algorithm/none_of.h b/libcxx/include/__algorithm/none_of.h index e6bd197622292..1e1c8d1aad637 100644 --- a/libcxx/include/__algorithm/none_of.h +++ b/libcxx/include/__algorithm/none_of.h @@ -10,7 +10,9 @@ #ifndef _LIBCPP___ALGORITHM_NONE_OF_H #define _LIBCPP___ALGORITHM_NONE_OF_H +#include <__algorithm/any_of.h> #include <__config> +#include <__functional/identity.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -21,10 +23,8 @@ 
_LIBCPP_BEGIN_NAMESPACE_STD template [[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool none_of(_InputIterator __first, _InputIterator __last, _Predicate __pred) { - for (; __first != __last; ++__first) - if (__pred(*__first)) - return false; - return true; + __identity __proj; + return !std::__any_of(__first, __last, __pred, __proj); } _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/array b/libcxx/include/array index ff46838e2e8e2..0b0c85458999c 100644 --- a/libcxx/include/array +++ b/libcxx/include/array @@ -210,28 +210,28 @@ struct array { } // iterators: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator begin() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator begin() _NOEXCEPT { # if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY) return std::__make_static_bounded_iter<_Size>(data(), data()); # else return iterator(data()); # endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator begin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator begin() const _NOEXCEPT { # if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY) return std::__make_static_bounded_iter<_Size>(data(), data()); # else return const_iterator(data()); # endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator end() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator end() _NOEXCEPT { # if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY) return std::__make_static_bounded_iter<_Size>(data() + _Size, data()); # else return iterator(data() + _Size); # endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator end() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator end() const _NOEXCEPT { # if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY) return std::__make_static_bounded_iter<_Size>(data() + _Size, data()); # else @@ -239,62 +239,81 @@ struct array { # endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rbegin() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rbegin() _NOEXCEPT { return reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator rbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator + rbegin() const _NOEXCEPT { return const_reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rend() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rend() _NOEXCEPT { return reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator rend() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator rend() const _NOEXCEPT { return const_reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator cbegin() const _NOEXCEPT { return begin(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator cend() const _NOEXCEPT { return end(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator crbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator cbegin() const _NOEXCEPT { + return begin(); + } + [[__nodiscard__]] 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator cend() const _NOEXCEPT { + return end(); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator + crbegin() const _NOEXCEPT { return rbegin(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator crend() const _NOEXCEPT { return rend(); } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator crend() const _NOEXCEPT { + return rend(); + } // capacity: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type size() const _NOEXCEPT { return _Size; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type max_size() const _NOEXCEPT { return _Size; } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type size() const _NOEXCEPT { return _Size; } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type max_size() const _NOEXCEPT { return _Size; } [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool empty() const _NOEXCEPT { return _Size == 0; } // element access: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference operator[](size_type __n) _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference operator[](size_type __n) _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__n < _Size, "out-of-bounds access in std::array"); return __elems_[__n]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference operator[](size_type __n) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference + operator[](size_type __n) const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__n < _Size, "out-of-bounds access in std::array"); return __elems_[__n]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference at(size_type __n) { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference at(size_type __n) { if (__n >= _Size) std::__throw_out_of_range("array::at"); return __elems_[__n]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference at(size_type __n) const { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference at(size_type __n) const { if (__n >= _Size) std::__throw_out_of_range("array::at"); return __elems_[__n]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference front() _NOEXCEPT { return (*this)[0]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference front() const _NOEXCEPT { return (*this)[0]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference back() _NOEXCEPT { return (*this)[_Size - 1]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference back() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference front() _NOEXCEPT { + return (*this)[0]; + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference front() const _NOEXCEPT { + return (*this)[0]; + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference back() _NOEXCEPT { + return (*this)[_Size - 1]; + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference back() const _NOEXCEPT { return (*this)[_Size - 1]; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 value_type* data() _NOEXCEPT { return __elems_; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const value_type* data() const _NOEXCEPT { return __elems_; } + [[__nodiscard__]] 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 value_type* data() _NOEXCEPT { + return __elems_; + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const value_type* data() const _NOEXCEPT { + return __elems_; + } }; template @@ -328,8 +347,10 @@ struct array<_Tp, 0> { }; _ALIGNAS_TYPE(_ArrayInStructT) _EmptyType __elems_[sizeof(_ArrayInStructT)]; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 value_type* data() _NOEXCEPT { return nullptr; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const value_type* data() const _NOEXCEPT { return nullptr; } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 value_type* data() _NOEXCEPT { return nullptr; } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const value_type* data() const _NOEXCEPT { + return nullptr; + } // No explicit construct/copy/destroy for aggregate type _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void fill(const value_type&) { @@ -341,28 +362,28 @@ struct array<_Tp, 0> { } // iterators: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator begin() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator begin() _NOEXCEPT { # if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY) return std::__make_static_bounded_iter<0>(data(), data()); # else return iterator(data()); # endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator begin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator begin() const _NOEXCEPT { # if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY) return std::__make_static_bounded_iter<0>(data(), data()); # else return const_iterator(data()); # endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator end() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 iterator end() _NOEXCEPT { # if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY) return std::__make_static_bounded_iter<0>(data(), data()); # else return iterator(data()); # endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator end() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator end() const _NOEXCEPT { # if defined(_LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY) return std::__make_static_bounded_iter<0>(data(), data()); # else @@ -370,68 +391,77 @@ struct array<_Tp, 0> { # endif } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rbegin() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rbegin() _NOEXCEPT { return reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator rbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator + rbegin() const _NOEXCEPT { return const_reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rend() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reverse_iterator rend() _NOEXCEPT { return reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator rend() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator rend() const _NOEXCEPT { return const_reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 
const_iterator cbegin() const _NOEXCEPT { return begin(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator cend() const _NOEXCEPT { return end(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator crbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator cbegin() const _NOEXCEPT { + return begin(); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_iterator cend() const _NOEXCEPT { + return end(); + } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator + crbegin() const _NOEXCEPT { return rbegin(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator crend() const _NOEXCEPT { return rend(); } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 const_reverse_iterator crend() const _NOEXCEPT { + return rend(); + } // capacity: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type size() const _NOEXCEPT { return 0; } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type max_size() const _NOEXCEPT { return 0; } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type size() const _NOEXCEPT { return 0; } + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR size_type max_size() const _NOEXCEPT { return 0; } [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR bool empty() const _NOEXCEPT { return true; } // element access: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference operator[](size_type) _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference operator[](size_type) _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(false, "cannot call array::operator[] on a zero-sized array"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference operator[](size_type) const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference + operator[](size_type) const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(false, "cannot call array::operator[] on a zero-sized array"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference at(size_type) { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference at(size_type) { std::__throw_out_of_range("array::at"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference at(size_type) const { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference at(size_type) const { std::__throw_out_of_range("array::at"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference front() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference front() _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(false, "cannot call array::front() on a zero-sized array"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference front() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference front() const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(false, "cannot call array::front() on a zero-sized array"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference back() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 reference back() _NOEXCEPT { 
_LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(false, "cannot call array::back() on a zero-sized array"); __libcpp_unreachable(); } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference back() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const_reference back() const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(false, "cannot call array::back() on a zero-sized array"); __libcpp_unreachable(); } @@ -501,25 +531,29 @@ struct tuple_element<_Ip, array<_Tp, _Size> > { }; template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp& get(array<_Tp, _Size>& __a) _NOEXCEPT { +[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp& +get(array<_Tp, _Size>& __a) _NOEXCEPT { static_assert(_Ip < _Size, "Index out of bounds in std::get<> (std::array)"); return __a.__elems_[_Ip]; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp& get(const array<_Tp, _Size>& __a) _NOEXCEPT { +[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp& +get(const array<_Tp, _Size>& __a) _NOEXCEPT { static_assert(_Ip < _Size, "Index out of bounds in std::get<> (const std::array)"); return __a.__elems_[_Ip]; } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp&& get(array<_Tp, _Size>&& __a) _NOEXCEPT { +[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp&& +get(array<_Tp, _Size>&& __a) _NOEXCEPT { static_assert(_Ip < _Size, "Index out of bounds in std::get<> (std::array &&)"); return std::move(__a.__elems_[_Ip]); } template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp&& get(const array<_Tp, _Size>&& __a) _NOEXCEPT { +[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 const _Tp&& +get(const array<_Tp, _Size>&& __a) _NOEXCEPT { static_assert(_Ip < _Size, "Index out of bounds in std::get<> (const std::array &&)"); return std::move(__a.__elems_[_Ip]); } @@ -539,7 +573,7 @@ __to_array_rvalue_impl(_Tp (&&__arr)[_Size], index_sequence<_Index...>) { } template -_LIBCPP_HIDE_FROM_ABI constexpr array, _Size> +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr array, _Size> to_array(_Tp (&__arr)[_Size]) noexcept(is_nothrow_constructible_v<_Tp, _Tp&>) { static_assert(!is_array_v<_Tp>, "[array.creation]/1: to_array does not accept multidimensional arrays."); static_assert(is_constructible_v<_Tp, _Tp&>, "[array.creation]/1: to_array requires copy constructible elements."); @@ -547,7 +581,7 @@ to_array(_Tp (&__arr)[_Size]) noexcept(is_nothrow_constructible_v<_Tp, _Tp&>) { } template -_LIBCPP_HIDE_FROM_ABI constexpr array, _Size> +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr array, _Size> to_array(_Tp (&&__arr)[_Size]) noexcept(is_nothrow_move_constructible_v<_Tp>) { static_assert(!is_array_v<_Tp>, "[array.creation]/4: to_array does not accept multidimensional arrays."); static_assert(is_move_constructible_v<_Tp>, "[array.creation]/4: to_array requires move constructible elements."); diff --git a/libcxx/include/list b/libcxx/include/list index 2898a45da0029..a5c84cad51489 100644 --- a/libcxx/include/list +++ b/libcxx/include/list @@ -774,57 +774,71 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign(size_type __n, const value_type& __x); - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT; + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT; - 
_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return this->__size_; } + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { + return this->__size_; + } [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { return __base::empty(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { return std::min(this->__node_alloc_max_size(), numeric_limits::max()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return __base::begin(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return __base::begin(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return __base::end(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return __base::end(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { + return __base::begin(); + } + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { + return __base::begin(); + } + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { + return __base::end(); + } + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { + return __base::end(); + } + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { return __base::begin(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return __base::end(); } + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { + return __base::end(); + } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() _NOEXCEPT { return reverse_iterator(end()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator + rbegin() const _NOEXCEPT { return const_reverse_iterator(end()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() _NOEXCEPT { return reverse_iterator(begin()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const _NOEXCEPT { return const_reverse_iterator(begin()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator + crbegin() const _NOEXCEPT { return const_reverse_iterator(end()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const _NOEXCEPT { + [[__nodiscard__]] 
_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const _NOEXCEPT { return const_reverse_iterator(begin()); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference front() { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference front() { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::front called on empty list"); return __base::__end_.__next_->__as_node()->__get_value(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reference front() const { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reference front() const { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::front called on empty list"); return __base::__end_.__next_->__as_node()->__get_value(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference back() { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference back() { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::back called on empty list"); return __base::__end_.__prev_->__as_node()->__get_value(); } - _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reference back() const { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reference back() const { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!empty(), "list::back called on empty list"); return __base::__end_.__prev_->__as_node()->__get_value(); } diff --git a/libcxx/include/string b/libcxx/include/string index c4806069d0b44..6b42cb2c7586d 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -2384,6 +2384,19 @@ private: return __guess; } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + __get_amortized_growth_capacity(size_type __required_capacity) { + size_type __max_size = max_size(); + if (__required_capacity > __max_size) + __throw_length_error(); + size_type __current_cap = capacity(); + _LIBCPP_ASSERT_INTERNAL( + __current_cap < __required_capacity, "Trying to grow string even though there is enough capacity already?"); + if (__current_cap > __max_size / 2 - __alignment) + return __max_size; + return std::max(__required_capacity, 2 * __current_cap); + } + inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void __init(const value_type* __s, size_type __sz); inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void __init(size_type __n, value_type __c); @@ -2714,14 +2727,10 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::__ size_type __n_del, size_type __n_add, const value_type* __p_new_stuff) { - size_type __ms = max_size(); - if (__delta_cap > __ms - __old_cap) - __throw_length_error(); + __long __buffer = __allocate_long_buffer(__alloc_, __get_amortized_growth_capacity(__old_cap + __delta_cap)); pointer __old_p = __get_pointer(); - size_type __cap = __old_cap < __ms / 2 - __alignment ? 
std::max(__old_cap + __delta_cap, 2 * __old_cap) : __ms; __annotate_delete(); - auto __guard = std::__make_scope_guard(__annotate_new_size(*this)); - __long __buffer = __allocate_long_buffer(__alloc_, __cap); + auto __guard = std::__make_scope_guard(__annotate_new_size(*this)); if (__n_copy != 0) traits_type::copy(std::__to_address(__buffer.__data_), std::__to_address(__old_p), __n_copy); if (__n_add != 0) @@ -2751,12 +2760,8 @@ _LIBCPP_DEPRECATED_("use __grow_by_without_replace") basic_string<_CharT, _Trait size_type __n_copy, size_type __n_del, size_type __n_add) { - size_type __ms = max_size(); - if (__delta_cap > __ms - __old_cap) - this->__throw_length_error(); + __long __buffer = __allocate_long_buffer(__alloc_, __get_amortized_growth_capacity(__old_cap + __delta_cap)); pointer __old_p = __get_pointer(); - size_type __cap = __old_cap < __ms / 2 - __alignment ? std::max(__old_cap + __delta_cap, 2 * __old_cap) : __ms; - __long __buffer = __allocate_long_buffer(__alloc_, __cap); if (__n_copy != 0) traits_type::copy(std::__to_address(__buffer.__data_), std::__to_address(__old_p), __n_copy); size_type __sec_cp_sz = __old_sz - __n_del - __n_copy; diff --git a/libcxx/test/libcxx/diagnostics/array.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/array.nodiscard.verify.cpp index 25a2f80b48f02..8e49807732de7 100644 --- a/libcxx/test/libcxx/diagnostics/array.nodiscard.verify.cpp +++ b/libcxx/test/libcxx/diagnostics/array.nodiscard.verify.cpp @@ -11,13 +11,70 @@ // check that functions are marked [[nodiscard]] #include +#include -void array_test() { - std::array array; - array.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +#include + +template +void test_members() { + std::array a; + const std::array ca{}; + + a.begin(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.begin(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.end(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.end(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.rbegin(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.rbegin(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.rend(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.rend(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.cbegin(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.cbegin(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.cend(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.cend(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.crbegin(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.crbegin(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.crend(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.crend(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + + 
a.size(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.max_size(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.empty(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + + a[0]; // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca[0]; // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.at(0); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.at(0); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + + a.front(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.front(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + a.back(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.back(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + + a.data(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} + ca.data(); // expected-warning 2 {{ignoring return value of function declared with 'nodiscard' attribute}} +} + +template +void test_get() { + std::array a{}; + + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + std::get<0>(a); + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + std::get<0>(std::move(a)); +} + +#if TEST_STD_VER >= 20 +void test_to_array() { + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + std::to_array("zmt"); + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + std::to_array({94, 82, 49}); } +#endif -void empty_array_test() { - std::array array; - array.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +void test() { + test_members<0>(); + test_members<82>(); } diff --git a/libcxx/test/libcxx/diagnostics/list.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/list.nodiscard.verify.cpp index f19224a71f5cc..bfce9b85ef76c 100644 --- a/libcxx/test/libcxx/diagnostics/list.nodiscard.verify.cpp +++ b/libcxx/test/libcxx/diagnostics/list.nodiscard.verify.cpp @@ -13,6 +13,33 @@ #include void test() { - std::list list; - list.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + std::list l; + const std::list cl; + + l.get_allocator(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.size(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.max_size(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + l.begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.begin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.end(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.end(); // expected-warning {{ignoring 
return value of function declared with 'nodiscard' attribute}} + l.cbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.cbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.cend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.cend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.rbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.rbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.rend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.rend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.crbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.crbegin(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.crend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.crend(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + + l.front(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.front(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + l.back(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} + cl.back(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} } diff --git a/libcxx/test/std/algorithms/robust_against_nonbool.compile.pass.cpp b/libcxx/test/std/algorithms/robust_against_nonbool.compile.pass.cpp new file mode 100644 index 0000000000000..e7c32d244a565 --- /dev/null +++ b/libcxx/test/std/algorithms/robust_against_nonbool.compile.pass.cpp @@ -0,0 +1,136 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// +// +// Algorithms that take predicates should support predicates that return a non-boolean value as long as the +// returned type is implicitly convertible to bool. 
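As a rough standalone illustration of the guarantee this new test checks (using hypothetical ConvertsToBool and LessWithProxyResult types, not the boolean_testable.h helpers the test itself relies on), a predicate whose result type is only implicitly convertible to bool should still be accepted by the classic algorithms:

#include <algorithm>
#include <vector>

// A result type that is not bool but converts to it implicitly.
struct ConvertsToBool {
  bool value;
  operator bool() const { return value; }
};

// A comparator returning the proxy type instead of bool.
struct LessWithProxyResult {
  ConvertsToBool operator()(int a, int b) const { return ConvertsToBool{a < b}; }
};

int main() {
  std::vector<int> v = {3, 1, 2};
  std::sort(v.begin(), v.end(), LessWithProxyResult{});
  return std::is_sorted(v.begin(), v.end(), LessWithProxyResult{}) ? 0 : 1;
}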
+ +#include + +#include + +#include "boolean_testable.h" + +using Value = StrictComparable; +using Iterator = StrictBooleanIterator; +auto pred1 = StrictUnaryPredicate; +auto pred2 = StrictBinaryPredicate; + +void f(Iterator it, Iterator out, std::size_t n, Value const& val, std::initializer_list ilist) { + (void)std::any_of(it, it, pred1); + (void)std::all_of(it, it, pred1); + (void)std::none_of(it, it, pred1); + (void)std::find_if(it, it, pred1); + (void)std::find_if_not(it, it, pred1); + (void)std::find_first_of(it, it, it, it); + (void)std::find_first_of(it, it, it, it, pred2); + (void)std::adjacent_find(it, it); + (void)std::adjacent_find(it, it, pred2); + (void)std::mismatch(it, it, it, it); + (void)std::mismatch(it, it, it, it, pred2); + (void)std::mismatch(it, it, it); + (void)std::mismatch(it, it, it); + (void)std::mismatch(it, it, it, pred2); + (void)std::equal(it, it, it, it); + (void)std::equal(it, it, it, it, pred2); + (void)std::equal(it, it, it); + (void)std::equal(it, it, it, pred2); + (void)std::lexicographical_compare(it, it, it, it); + (void)std::lexicographical_compare(it, it, it, it, pred2); + (void)std::partition_point(it, it, pred1); + (void)std::lower_bound(it, it, val); + (void)std::lower_bound(it, it, val, pred2); + (void)std::upper_bound(it, it, val); + (void)std::upper_bound(it, it, val, pred2); + (void)std::equal_range(it, it, val); + (void)std::equal_range(it, it, val, pred2); + (void)std::binary_search(it, it, val); + (void)std::binary_search(it, it, val, pred2); + (void)std::min(val, val); + (void)std::min(val, val, pred2); + (void)std::min(ilist); + (void)std::min(ilist, pred2); + (void)std::max(val, val); + (void)std::max(val, val, pred2); + (void)std::max(ilist); + (void)std::max(ilist, pred2); + (void)std::minmax(val, val); + (void)std::minmax(val, val, pred2); + (void)std::minmax(ilist); + (void)std::minmax(ilist, pred2); + (void)std::min_element(it, it); + (void)std::min_element(it, it, pred2); + (void)std::max_element(it, it); + (void)std::max_element(it, it, pred2); + (void)std::minmax_element(it, it); + (void)std::minmax_element(it, it, pred2); + (void)std::count_if(it, it, pred1); + (void)std::search(it, it, it, it); + (void)std::search(it, it, it, it, pred2); + (void)std::search_n(it, it, n, val); + (void)std::search_n(it, it, n, val, pred2); + (void)std::is_partitioned(it, it, pred1); + (void)std::is_sorted(it, it); + (void)std::is_sorted(it, it, pred2); + (void)std::is_sorted_until(it, it); + (void)std::is_sorted_until(it, it, pred2); + (void)std::is_heap(it, it); + (void)std::is_heap(it, it, pred2); + (void)std::is_heap_until(it, it); + (void)std::is_heap_until(it, it, pred2); + (void)std::clamp(val, val, val); + (void)std::clamp(val, val, val, pred2); + (void)std::is_permutation(it, it, it, it); + (void)std::is_permutation(it, it, it, it, pred2); + (void)std::copy_if(it, it, out, pred1); + (void)std::remove_copy_if(it, it, out, pred1); + (void)std::remove_copy(it, it, out, val); + (void)std::replace(it, it, val, val); + (void)std::replace_if(it, it, pred1, val); + (void)std::replace_copy_if(it, it, out, pred1, val); + (void)std::replace_copy(it, it, out, val, val); + (void)std::unique_copy(it, it, out, pred2); + (void)std::partition_copy(it, it, out, out, pred1); + (void)std::partial_sort_copy(it, it, it, it, pred2); + (void)std::merge(it, it, it, it, out); + (void)std::merge(it, it, it, it, out, pred2); + (void)std::set_difference(it, it, it, it, out, pred2); + (void)std::set_intersection(it, it, it, it, out, pred2); + 
(void)std::set_symmetric_difference(it, it, it, it, out, pred2); + (void)std::set_union(it, it, it, it, out, pred2); + (void)std::remove_if(it, it, pred1); + (void)std::remove(it, it, val); + (void)std::unique(it, it, pred2); + (void)std::partition(it, it, pred1); + (void)std::stable_partition(it, it, pred1); + (void)std::sort(it, it); + (void)std::sort(it, it, pred2); + (void)std::stable_sort(it, it); + (void)std::stable_sort(it, it, pred2); + (void)std::partial_sort(it, it, it); + (void)std::partial_sort(it, it, it, pred2); + (void)std::nth_element(it, it, it); + (void)std::nth_element(it, it, it, pred2); + (void)std::inplace_merge(it, it, it); + (void)std::inplace_merge(it, it, it, pred2); + (void)std::make_heap(it, it); + (void)std::make_heap(it, it, pred2); + (void)std::push_heap(it, it); + (void)std::push_heap(it, it, pred2); + (void)std::pop_heap(it, it); + (void)std::pop_heap(it, it, pred2); + (void)std::sort_heap(it, it); + (void)std::sort_heap(it, it, pred2); + (void)std::prev_permutation(it, it); + (void)std::prev_permutation(it, it, pred2); + (void)std::next_permutation(it, it); + (void)std::next_permutation(it, it, pred2); +} diff --git a/libcxx/test/std/containers/sequences/array/array.creation/to_array.verify.cpp b/libcxx/test/std/containers/sequences/array/array.creation/to_array.verify.cpp index e3efef988f0f4..ee8d8e6cfb2e5 100644 --- a/libcxx/test/std/containers/sequences/array/array.creation/to_array.verify.cpp +++ b/libcxx/test/std/containers/sequences/array/array.creation/to_array.verify.cpp @@ -22,21 +22,21 @@ int main(int, char**) { // expected-error@array:* {{to_array does not accept multidimensional arrays}} // expected-error@array:* {{to_array requires copy constructible elements}} // expected-error@array:* 3 {{cannot initialize}} - std::to_array(source); // expected-note {{requested here}} + (void)std::to_array(source); // expected-note {{requested here}} } { MoveOnly mo[] = {MoveOnly{3}}; // expected-error@array:* {{to_array requires copy constructible elements}} // expected-error-re@array:* 1-2{{{{(call to implicitly-deleted copy constructor of 'MoveOnly')|(call to deleted constructor of 'MoveOnly')}}}} - std::to_array(mo); // expected-note {{requested here}} + (void)std::to_array(mo); // expected-note {{requested here}} } { const MoveOnly cmo[] = {MoveOnly{3}}; // expected-error@array:* {{to_array requires move constructible elements}} // expected-error-re@array:* 0-1{{{{(call to implicitly-deleted copy constructor of 'MoveOnly')|(call to deleted constructor of 'MoveOnly')}}}} - std::to_array(std::move(cmo)); // expected-note {{requested here}} + (void)std::to_array(std::move(cmo)); // expected-note {{requested here}} } return 0; diff --git a/libcxx/utils/ci/buildkite-pipeline.yml b/libcxx/utils/ci/buildkite-pipeline.yml index 2ac69c38ebffa..8b77a06323e3d 100644 --- a/libcxx/utils/ci/buildkite-pipeline.yml +++ b/libcxx/utils/ci/buildkite-pipeline.yml @@ -33,63 +33,64 @@ definitions: - "**/CMakeOutput.log" steps: -- group: ARM - steps: - - label: AArch64 - command: libcxx/utils/ci/run-buildbot aarch64 - agents: - queue: libcxx-builders-linaro-arm - arch: aarch64 - <<: *common - - - label: AArch64 -fno-exceptions - command: libcxx/utils/ci/run-buildbot aarch64-no-exceptions - agents: - queue: libcxx-builders-linaro-arm - arch: aarch64 - <<: *common - - - label: Armv8 - command: libcxx/utils/ci/run-buildbot armv8 - agents: - queue: libcxx-builders-linaro-arm - arch: armv8l - <<: *common - - - label: Armv8 -fno-exceptions - command: libcxx/utils/ci/run-buildbot 
armv8-no-exceptions - agents: - queue: libcxx-builders-linaro-arm - arch: armv8l - <<: *common - - - label: Armv7 - command: libcxx/utils/ci/run-buildbot armv7 - agents: - queue: libcxx-builders-linaro-arm - arch: armv8l - <<: *common - - - label: Armv7 -fno-exceptions - command: libcxx/utils/ci/run-buildbot armv7-no-exceptions - agents: - queue: libcxx-builders-linaro-arm - arch: armv8l - <<: *common - - - label: Armv7-M picolibc - command: libcxx/utils/ci/run-buildbot armv7m-picolibc - agents: - queue: libcxx-builders-linaro-arm - arch: aarch64 - <<: *common - - - label: Armv7-M picolibc -fno-exceptions - command: libcxx/utils/ci/run-buildbot armv7m-picolibc-no-exceptions - agents: - queue: libcxx-builders-linaro-arm - arch: aarch64 - <<: *common +# Linaro's ARM builders are temporarily offline. +#- group: ARM +# steps: +# - label: AArch64 +# command: libcxx/utils/ci/run-buildbot aarch64 +# agents: +# queue: libcxx-builders-linaro-arm +# arch: aarch64 +# <<: *common +# +# - label: AArch64 -fno-exceptions +# command: libcxx/utils/ci/run-buildbot aarch64-no-exceptions +# agents: +# queue: libcxx-builders-linaro-arm +# arch: aarch64 +# <<: *common +# +# - label: Armv8 +# command: libcxx/utils/ci/run-buildbot armv8 +# agents: +# queue: libcxx-builders-linaro-arm +# arch: armv8l +# <<: *common +# +# - label: Armv8 -fno-exceptions +# command: libcxx/utils/ci/run-buildbot armv8-no-exceptions +# agents: +# queue: libcxx-builders-linaro-arm +# arch: armv8l +# <<: *common +# +# - label: Armv7 +# command: libcxx/utils/ci/run-buildbot armv7 +# agents: +# queue: libcxx-builders-linaro-arm +# arch: armv8l +# <<: *common +# +# - label: Armv7 -fno-exceptions +# command: libcxx/utils/ci/run-buildbot armv7-no-exceptions +# agents: +# queue: libcxx-builders-linaro-arm +# arch: armv8l +# <<: *common +# +# - label: Armv7-M picolibc +# command: libcxx/utils/ci/run-buildbot armv7m-picolibc +# agents: +# queue: libcxx-builders-linaro-arm +# arch: aarch64 +# <<: *common +# +# - label: Armv7-M picolibc -fno-exceptions +# command: libcxx/utils/ci/run-buildbot armv7m-picolibc-no-exceptions +# agents: +# queue: libcxx-builders-linaro-arm +# arch: aarch64 +# <<: *common - group: AIX steps: diff --git a/lldb/include/lldb/Target/UnixSignals.h b/lldb/include/lldb/Target/UnixSignals.h index a1807d69f329b..590e4d1aa5208 100644 --- a/lldb/include/lldb/Target/UnixSignals.h +++ b/lldb/include/lldb/Target/UnixSignals.h @@ -31,6 +31,8 @@ class UnixSignals { llvm::StringRef GetSignalAsStringRef(int32_t signo) const; + llvm::StringRef GetSignalNumberDescription(int32_t signo) const; + std::string GetSignalDescription(int32_t signo, std::optional code = std::nullopt, diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp index 7d326404a5503..c17f12fae6e79 100644 --- a/lldb/source/Commands/CommandObjectProcess.cpp +++ b/lldb/source/Commands/CommandObjectProcess.cpp @@ -1603,8 +1603,8 @@ class CommandObjectProcessHandle : public CommandObjectParsed { Options *GetOptions() override { return &m_options; } void PrintSignalHeader(Stream &str) { - str.Printf("NAME PASS STOP NOTIFY\n"); - str.Printf("=========== ===== ===== ======\n"); + str.Printf("NAME PASS STOP NOTIFY DESCRIPTION\n"); + str.Printf("=========== ===== ===== ====== ===================\n"); } void PrintSignal(Stream &str, int32_t signo, llvm::StringRef sig_name, @@ -1615,9 +1615,16 @@ class CommandObjectProcessHandle : public CommandObjectParsed { str.Format("{0, -11} ", sig_name); if (signals_sp->GetSignalInfo(signo, 
suppress, stop, notify)) { - bool pass = !suppress; + const bool pass = !suppress; str.Printf("%s %s %s", (pass ? "true " : "false"), (stop ? "true " : "false"), (notify ? "true " : "false")); + + const llvm::StringRef sig_description = + signals_sp->GetSignalNumberDescription(signo); + if (!sig_description.empty()) { + str.PutCString(" "); + str.PutCString(sig_description); + } } str.Printf("\n"); } diff --git a/lldb/source/Target/UnixSignals.cpp b/lldb/source/Target/UnixSignals.cpp index 6113c6648817c..881431f4631e5 100644 --- a/lldb/source/Target/UnixSignals.cpp +++ b/lldb/source/Target/UnixSignals.cpp @@ -137,6 +137,13 @@ llvm::StringRef UnixSignals::GetSignalAsStringRef(int32_t signo) const { return pos->second.m_name; } +llvm::StringRef UnixSignals::GetSignalNumberDescription(int32_t signo) const { + const auto pos = m_signals.find(signo); + if (pos == m_signals.end()) + return {}; + return pos->second.m_description; +} + std::string UnixSignals::GetSignalDescription( int32_t signo, std::optional code, std::optional addr, std::optional lower, diff --git a/lldb/unittests/Signals/UnixSignalsTest.cpp b/lldb/unittests/Signals/UnixSignalsTest.cpp index 582e441556067..3bd4aedd600a3 100644 --- a/lldb/unittests/Signals/UnixSignalsTest.cpp +++ b/lldb/unittests/Signals/UnixSignalsTest.cpp @@ -148,6 +148,18 @@ TEST(UnixSignalsTest, GetAsString) { signals.GetSignalDescription(16, 3, 0x1233, 0x1234, 0x5678)); } +TEST(UnixSignalsTest, GetNumberDescription) { + TestSignals signals; + + ASSERT_EQ("DESC2", signals.GetSignalNumberDescription(2)); + ASSERT_EQ("DESC4", signals.GetSignalNumberDescription(4)); + ASSERT_EQ("DESC8", signals.GetSignalNumberDescription(8)); + ASSERT_EQ("DESC16", signals.GetSignalNumberDescription(16)); + + // Unknown signal number. + ASSERT_EQ("", signals.GetSignalNumberDescription(100)); +} + TEST(UnixSignalsTest, VersionChange) { TestSignals signals; diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h index 654a5f10cea96..2c8484fde5b16 100644 --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -451,12 +451,10 @@ class InductionDescriptor { : Instruction::BinaryOpsEnd; } - /// Returns a reference to the type cast instructions in the induction + /// Returns an ArrayRef to the type cast instructions in the induction /// update chain, that are redundant when guarded with a runtime /// SCEV overflow check. - const SmallVectorImpl &getCastInsts() const { - return RedundantCasts; - } + ArrayRef getCastInsts() const { return RedundantCasts; } private: /// Private constructor - used by \c isInductionPHI. diff --git a/llvm/include/llvm/ExecutionEngine/Orc/WaitingOnGraph.h b/llvm/include/llvm/ExecutionEngine/Orc/WaitingOnGraph.h index f9995917363f9..9f14c8b2efd5f 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/WaitingOnGraph.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/WaitingOnGraph.h @@ -500,7 +500,7 @@ template class WaitingOnGraph { if (I == SN->Deps.end()) continue; for (auto &[DefElem, DefSN] : DefElems) - if (I->second.erase(DefElem)) + if (I->second.erase(DefElem) && DefSN != SN.get()) SNDeps.insert(DefSN); if (I->second.empty()) SN->Deps.erase(I); @@ -511,11 +511,13 @@ template class WaitingOnGraph { // Compute transitive closure of deps for each node. 
static void propagateSuperNodeDeps(SuperNodeDepsMap &SuperNodeDeps) { for (auto &[SN, Deps] : SuperNodeDeps) { - DenseSet Reachable({SN}); + DenseSet Reachable; SmallVector Worklist(Deps.begin(), Deps.end()); while (!Worklist.empty()) { auto *DepSN = Worklist.pop_back_val(); + if (DepSN == SN) + continue; if (!Reachable.insert(DepSN).second) continue; auto I = SuperNodeDeps.find(DepSN); @@ -537,9 +539,11 @@ template class WaitingOnGraph { if (I == SuperNodeDeps.end()) continue; - for (auto *DepSN : I->second) + for (auto *DepSN : I->second) { + assert(DepSN != SN.get() && "Unexpected self-dependence for SN"); for (auto &[Container, Elems] : DepSN->Deps) SN->Deps[Container].insert(Elems.begin(), Elems.end()); + } } } diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 001f0444eb910..f7934dd7e755f 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2507,7 +2507,7 @@ class AMDGPUWaveReduce : Intrinsic< multiclass AMDGPUWaveReduceOps { foreach Op = - ["umin", "fmin", "min", "umax", "fmax", "max", "add", "sub", "and", "or", "xor"] in { + ["umin", "fmin", "min", "umax", "fmax", "max", "add", "fadd", "sub", "fsub", "and", "or", "xor"] in { def Op : AMDGPUWaveReduce; } } diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 641850b46bbd8..26c7f805bdb6d 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -270,10 +270,12 @@ bool RecurrenceDescriptor::AddReductionVar( // resulting from the type promotion performed by InstCombine. Vector // operations are not limited to the legal integer widths, so we may be able // to evaluate the reduction in the narrower width. - if (RecurrenceType->isFloatingPointTy()) { + // Check the scalar type to handle both scalar and vector types. + Type *ScalarTy = RecurrenceType->getScalarType(); + if (ScalarTy->isFloatingPointTy()) { if (!isFloatingPointRecurrenceKind(Kind)) return false; - } else if (RecurrenceType->isIntegerTy()) { + } else if (ScalarTy->isIntegerTy()) { if (!isIntegerRecurrenceKind(Kind)) return false; if (!isMinMaxRecurrenceKind(Kind)) diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp index 2aede017133b0..84f27c4938050 100644 --- a/llvm/lib/CAS/OnDiskGraphDB.cpp +++ b/llvm/lib/CAS/OnDiskGraphDB.cpp @@ -452,13 +452,6 @@ Expected DataRecordHandle::createWithError( return Mem.takeError(); } -DataRecordHandle -DataRecordHandle::create(function_ref Alloc, - const Input &I) { - Layout L(I); - return constructImpl(Alloc(L.getTotalSize()), I, L); -} - ObjectHandle ObjectHandle::fromFileOffset(FileOffset Offset) { // Store the file offset as it is. 
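For context on the IVDescriptors.cpp hunk above: Type::getScalarType() returns the element type for vector types and the type itself for scalars, so checking the scalar type covers both float and <4 x float> recurrences with a single test. A minimal sketch of that behaviour as a standalone program against the LLVM C++ API (not part of this patch):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <cassert>

int main() {
  llvm::LLVMContext Ctx;
  llvm::Type *FloatTy = llvm::Type::getFloatTy(Ctx);
  llvm::Type *VecTy = llvm::FixedVectorType::get(FloatTy, 4); // <4 x float>

  // The scalar type of both is a floating-point type...
  assert(FloatTy->getScalarType()->isFloatingPointTy());
  assert(VecTy->getScalarType()->isFloatingPointTy());
  // ...while the vector type itself is not reported as floating point.
  assert(!VecTy->isFloatingPointTy());
  return 0;
}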
assert(!(Offset.get() & 0x1)); @@ -621,10 +614,6 @@ bool TrieRecord::compare_exchange_strong(Data &Existing, Data New) { return false; } -DataRecordHandle DataRecordHandle::construct(char *Mem, const Input &I) { - return constructImpl(Mem, I, Layout(I)); -} - Expected DataRecordHandle::getFromDataPool(const OnDiskDataAllocator &Pool, FileOffset Offset) { diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp index d029ac587fb9a..9233c8ed8d0dd 100644 --- a/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -713,12 +713,7 @@ JITDylib::defineMaterializing(MaterializationResponsibility &FromMR, std::vector AddedSyms; std::vector RejectedWeakDefs; - for (auto SFItr = SymbolFlags.begin(), SFEnd = SymbolFlags.end(); - SFItr != SFEnd; ++SFItr) { - - auto &Name = SFItr->first; - auto &Flags = SFItr->second; - + for (auto &[Name, Flags] : SymbolFlags) { auto EntryItr = Symbols.find(Name); // If the entry already exists... diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 0bbe117ecf51f..e91f5a877b35b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1809,11 +1809,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, if (!Subtarget->hasSVEB16B16() || !Subtarget->isNonStreamingSVEorSME2Available()) { - for (auto Opcode : {ISD::FADD, ISD::FMA, ISD::FMAXIMUM, ISD::FMAXNUM, - ISD::FMINIMUM, ISD::FMINNUM, ISD::FMUL, ISD::FSUB}) { - setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32); - setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32); - setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32); + for (MVT VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) { + MVT PromotedVT = VT.changeVectorElementType(MVT::f32); + setOperationPromotedToType(ISD::FADD, VT, PromotedVT); + setOperationPromotedToType(ISD::FMA, VT, PromotedVT); + setOperationPromotedToType(ISD::FMAXIMUM, VT, PromotedVT); + setOperationPromotedToType(ISD::FMAXNUM, VT, PromotedVT); + setOperationPromotedToType(ISD::FMINIMUM, VT, PromotedVT); + setOperationPromotedToType(ISD::FMINNUM, VT, PromotedVT); + setOperationPromotedToType(ISD::FSUB, VT, PromotedVT); + + if (VT != MVT::nxv2bf16 && Subtarget->hasBF16()) + setOperationAction(ISD::FMUL, VT, Custom); + else + setOperationPromotedToType(ISD::FMUL, VT, PromotedVT); } } @@ -7670,6 +7679,57 @@ SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, EndOfTrmp); } +SDValue AArch64TargetLowering::LowerFMUL(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + if (VT.getScalarType() != MVT::bf16 || + (Subtarget->hasSVEB16B16() && + Subtarget->isNonStreamingSVEorSME2Available())) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED); + + assert(Subtarget->hasBF16() && "Expected +bf16 for custom FMUL lowering"); + assert((VT == MVT::nxv4bf16 || VT == MVT::nxv8bf16) && "Unexpected FMUL VT"); + + auto MakeGetIntrinsic = [&](Intrinsic::ID IID) { + return [&, IID](EVT VT, auto... Ops) { + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(IID, DL, MVT::i32), Ops...); + }; + }; + + auto ReinterpretCast = [&](SDValue Value, EVT VT) { + if (VT == Value.getValueType()) + return Value; + return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Value); + }; + + // Create helpers for building intrinsic calls. 
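+  // BFMLALB/BFMLALT widen and multiply-accumulate the even/odd bf16 lanes
+  // into f32 accumulators, while FCVT/FCVTNT narrow f32 results back into
+  // the even/odd bf16 lanes of the destination vector.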
+ auto BFMLALB = MakeGetIntrinsic(Intrinsic::aarch64_sve_bfmlalb); + auto BFMLALT = MakeGetIntrinsic(Intrinsic::aarch64_sve_bfmlalt); + auto FCVT = MakeGetIntrinsic(Intrinsic::aarch64_sve_fcvt_bf16f32_v2); + auto FCVTNT = MakeGetIntrinsic(Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2); + + // All intrinsics expect to operate on full bf16 vector types. + SDValue LHS = ReinterpretCast(Op.getOperand(0), MVT::nxv8bf16); + SDValue RHS = ReinterpretCast(Op.getOperand(1), MVT::nxv8bf16); + + SDValue Zero = + DAG.getNeutralElement(ISD::FADD, DL, MVT::nxv4f32, Op->getFlags()); + SDValue Pg = DAG.getConstant(1, DL, MVT::nxv4i1); + + // Lower bf16 FMUL as a pair (VT == nxv8bf16) of BFMLAL top/bottom + // instructions. These result in two f32 vectors, which can be converted back + // to bf16 with FCVT and FCVTNT. + SDValue BottomF32 = BFMLALB(MVT::nxv4f32, Zero, LHS, RHS); + SDValue BottomBF16 = + FCVT(MVT::nxv8bf16, DAG.getPOISON(MVT::nxv8bf16), Pg, BottomF32); + // Note: nxv4bf16 only uses even lanes. + if (VT == MVT::nxv4bf16) + return ReinterpretCast(BottomBF16, VT); + SDValue TopF32 = BFMLALT(MVT::nxv4f32, Zero, LHS, RHS); + return FCVTNT(VT, BottomBF16, Pg, TopF32); +} + SDValue AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Custom lowering: "); @@ -7744,7 +7804,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::FSUB: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED); case ISD::FMUL: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED); + return LowerFMUL(Op, DAG); case ISD::FMA: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED); case ISD::FDIV: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index be198e54cbcbf..ca08eb40c956a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -614,6 +614,7 @@ class AArch64TargetLowering : public TargetLowering { SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerStore128(SDValue Op, SelectionDAG &DAG) const; SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFMUL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMGATHER(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMSCATTER(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c27ecf2887691..d9d24556027ee 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5214,7 +5214,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case Intrinsic::amdgcn_wave_reduce_add: + case Intrinsic::amdgcn_wave_reduce_fadd: case Intrinsic::amdgcn_wave_reduce_sub: + case Intrinsic::amdgcn_wave_reduce_fsub: case Intrinsic::amdgcn_wave_reduce_min: case Intrinsic::amdgcn_wave_reduce_umin: case Intrinsic::amdgcn_wave_reduce_fmin: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a6920dfb88fb6..b03683ad7eec9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5502,6 +5502,10 @@ static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) { return std::numeric_limits::min(); case AMDGPU::S_MAX_I32: return std::numeric_limits::min(); + case AMDGPU::V_ADD_F32_e64: // -0.0 + return 0x80000000; + case AMDGPU::V_SUB_F32_e64: // +0.0 + return 0x0; case AMDGPU::S_ADD_I32: 
case AMDGPU::S_SUB_I32: case AMDGPU::S_OR_B32: @@ -5547,11 +5551,13 @@ static bool is32bitWaveReduceOperation(unsigned Opc) { Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 || Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 || Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 || - Opc == AMDGPU::V_MAX_F32_e64; + Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 || + Opc == AMDGPU::V_SUB_F32_e64; } static bool isFloatingPointWaveReduceOperation(unsigned Opc) { - return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64; + return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 || + Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64; } static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, @@ -5598,8 +5604,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, case AMDGPU::S_XOR_B64: case AMDGPU::S_ADD_I32: case AMDGPU::S_ADD_U64_PSEUDO: + case AMDGPU::V_ADD_F32_e64: case AMDGPU::S_SUB_I32: - case AMDGPU::S_SUB_U64_PSEUDO: { + case AMDGPU::S_SUB_U64_PSEUDO: + case AMDGPU::V_SUB_F32_e64: { const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg); Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass); @@ -5754,6 +5762,30 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, .addImm(AMDGPU::sub1); break; } + case AMDGPU::V_ADD_F32_e64: + case AMDGPU::V_SUB_F32_e64: { + Register ActiveLanesVreg = + MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + // Get number of active lanes as a float val. + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_CVT_F32_I32_e64), + ActiveLanesVreg) + .addReg(NewAccumulator->getOperand(0).getReg()) + .addImm(0) // clamp + .addImm(0); // output-modifier + + // Take negation of input for SUB reduction + unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 
1 : 0; + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_F32_e64), DstVreg) + .addImm(srcMod) // src0 modifier + .addReg(SrcReg) + .addImm(0) // src1 modifier + .addReg(ActiveLanesVreg) + .addImm(0) // clamp + .addImm(0); // output-mod + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg) + .addReg(DstVreg); + } } RetBB = &BB; } @@ -6001,10 +6033,14 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32); case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO); + case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64); case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32); case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO); + case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64); case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32: return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32); case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64: diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 0201a98a54cc7..2e0dc2fb9d2bd 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -374,6 +374,8 @@ defvar Operations = [ WaveReduceOp<"fmin", "F32", f32, SGPR_32, VSrc_b32>, WaveReduceOp<"fmax", "F32", f32, SGPR_32, VSrc_b32>, + WaveReduceOp<"fadd", "F32", f32, SGPR_32, VSrc_b32>, + WaveReduceOp<"fsub", "F32", f32, SGPR_32, VSrc_b32>, ]; foreach Op = Operations in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 9cb53fb27a2d2..84b962b2a8607 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -768,7 +768,7 @@ def BGE : BranchCC_rri<0b101, "bge">; def BLTU : BranchCC_rri<0b110, "bltu">; def BGEU : BranchCC_rri<0b111, "bgeu">; -let IsSignExtendingOpW = 1, canFoldAsLoad = 1 in { +let IsSignExtendingOpW = 1, canFoldAsLoad = 1, isReMaterializable = 1 in { def LB : Load_ri<0b000, "lb">, Sched<[WriteLDB, ReadMemBase]>; def LH : Load_ri<0b001, "lh">, Sched<[WriteLDH, ReadMemBase]>; def LW : Load_ri<0b010, "lw">, Sched<[WriteLDW, ReadMemBase]>; @@ -889,7 +889,7 @@ def CSRRCI : CSR_ii<0b111, "csrrci">; /// RV64I instructions let Predicates = [IsRV64] in { -let canFoldAsLoad = 1 in { +let canFoldAsLoad = 1, isReMaterializable = 1 in { def LWU : Load_ri<0b110, "lwu">, Sched<[WriteLDW, ReadMemBase]>; def LD : Load_ri<0b011, "ld">, Sched<[WriteLDD, ReadMemBase]>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index 4ffe3e62ac501..deacd41e6469a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -71,7 +71,7 @@ defvar DExtsRV64 = [DExt, ZdinxExt]; //===----------------------------------------------------------------------===// let Predicates = [HasStdExtD] in { -let canFoldAsLoad = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def FLD : FPLoad_r<0b011, "fld", FPR64, WriteFLD64>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index b30f8ec820c15..bd191001b75ec 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td 
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -330,7 +330,7 @@ class PseudoFROUND //===----------------------------------------------------------------------===// let Predicates = [HasStdExtF] in { -let canFoldAsLoad = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def FLW : FPLoad_r<0b010, "flw", FPR32, WriteFLD32>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index 1c6a5afcda49b..c172d1739ba61 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -90,7 +90,7 @@ defvar ZfhminDExts = [ZfhminDExt, ZhinxminZdinxExt, ZhinxminZdinx32Ext]; //===----------------------------------------------------------------------===// let Predicates = [HasHalfFPLoadStoreMove] in { -let canFoldAsLoad = 1 in +let canFoldAsLoad = 1, isReMaterializable = 1 in def FLH : FPLoad_r<0b001, "flh", FPR16, WriteFLD16>; // Operands for stores are in the order srcreg, base, offset rather than diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index bd754d17694b8..00f750b88a608 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -494,8 +494,8 @@ MCRegister SPIRVModuleAnalysis::handleVariable( void SPIRVModuleAnalysis::collectDeclarations(const Module &M) { InstrGRegsMap SignatureToGReg; std::map GlobalToGReg; - for (auto F = M.begin(), E = M.end(); F != E; ++F) { - MachineFunction *MF = MMI->getMachineFunction(*F); + for (const Function &F : M) { + MachineFunction *MF = MMI->getMachineFunction(F); if (!MF) continue; const MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -634,10 +634,10 @@ static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI, // be correctly collected until these registers are globally numbered. void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) { InstrTraces IS; - for (auto F = M.begin(), E = M.end(); F != E; ++F) { - if (F->isDeclaration()) + for (const Function &F : M) { + if (F.isDeclaration()) continue; - MachineFunction *MF = MMI->getMachineFunction(*F); + MachineFunction *MF = MMI->getMachineFunction(F); assert(MF); for (MachineBasicBlock &MBB : *MF) @@ -669,13 +669,13 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) { collectOtherInstr(MI, MAI, SPIRV::MB_AliasingInsts, IS); } else if (TII->isDecorationInstr(MI)) { collectOtherInstr(MI, MAI, SPIRV::MB_Annotations, IS); - collectFuncNames(MI, &*F); + collectFuncNames(MI, &F); } else if (TII->isConstantInstr(MI)) { // Now OpSpecConstant*s are not in DT, // but they need to be collected anyway. collectOtherInstr(MI, MAI, SPIRV::MB_TypeConstVars, IS); } else if (OpCode == SPIRV::OpFunction) { - collectFuncNames(MI, &*F); + collectFuncNames(MI, &F); } else if (OpCode == SPIRV::OpTypeForwardPointer) { collectOtherInstr(MI, MAI, SPIRV::MB_TypeConstVars, IS, false); } @@ -687,10 +687,10 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) { // the result in global register alias table. Some registers are already // numbered. 
void SPIRVModuleAnalysis::numberRegistersGlobally(const Module &M) { - for (auto F = M.begin(), E = M.end(); F != E; ++F) { - if ((*F).isDeclaration()) + for (const Function &F : M) { + if (F.isDeclaration()) continue; - MachineFunction *MF = MMI->getMachineFunction(*F); + MachineFunction *MF = MMI->getMachineFunction(F); assert(MF); for (MachineBasicBlock &MBB : *MF) { for (MachineInstr &MI : MBB) { @@ -2169,8 +2169,8 @@ void addInstrRequirements(const MachineInstr &MI, static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, MachineModuleInfo *MMI, const SPIRVSubtarget &ST) { // Collect requirements for existing instructions. - for (auto F = M.begin(), E = M.end(); F != E; ++F) { - MachineFunction *MF = MMI->getMachineFunction(*F); + for (const Function &F : M) { + MachineFunction *MF = MMI->getMachineFunction(F); if (!MF) continue; for (const MachineBasicBlock &MBB : *MF) @@ -2250,8 +2250,7 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, if (RequireKHRFloatControls2) MAI.Reqs.addExtension(SPIRV::Extension::SPV_KHR_float_controls2); } - for (auto FI = M.begin(), E = M.end(); FI != E; ++FI) { - const Function &F = *FI; + for (const Function &F : M) { if (F.isDeclaration()) continue; if (F.getMetadata("reqd_work_group_size")) @@ -2431,23 +2430,23 @@ static void addDecorations(const Module &M, const SPIRVInstrInfo &TII, MachineModuleInfo *MMI, const SPIRVSubtarget &ST, SPIRV::ModuleAnalysisInfo &MAI, const SPIRVGlobalRegistry *GR) { - for (auto F = M.begin(), E = M.end(); F != E; ++F) { - MachineFunction *MF = MMI->getMachineFunction(*F); + for (const Function &F : M) { + MachineFunction *MF = MMI->getMachineFunction(F); if (!MF) continue; for (auto &MBB : *MF) for (auto &MI : MBB) handleMIFlagDecoration(MI, ST, TII, MAI.Reqs, GR, - MAI.FPFastMathDefaultInfoMap[&(*F)]); + MAI.FPFastMathDefaultInfoMap[&F]); } } static void addMBBNames(const Module &M, const SPIRVInstrInfo &TII, MachineModuleInfo *MMI, const SPIRVSubtarget &ST, SPIRV::ModuleAnalysisInfo &MAI) { - for (auto F = M.begin(), E = M.end(); F != E; ++F) { - MachineFunction *MF = MMI->getMachineFunction(*F); + for (const Function &F : M) { + MachineFunction *MF = MMI->getMachineFunction(F); if (!MF) continue; MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -2467,8 +2466,8 @@ static void addMBBNames(const Module &M, const SPIRVInstrInfo &TII, // patching Instruction::PHI to SPIRV::OpPhi static void patchPhis(const Module &M, SPIRVGlobalRegistry *GR, const SPIRVInstrInfo &TII, MachineModuleInfo *MMI) { - for (auto F = M.begin(), E = M.end(); F != E; ++F) { - MachineFunction *MF = MMI->getMachineFunction(*F); + for (const Function &F : M) { + MachineFunction *MF = MMI->getMachineFunction(F); if (!MF) continue; for (auto &MBB : *MF) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 18a45c6799bac..98e2d9ebe4fc2 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -140,8 +140,8 @@ Instruction *InstCombinerImpl::scalarizePHI(ExtractElementInst &EI, Value *Elt = EI.getIndexOperand(); // If the operand is the PHI induction variable: if (PHIInVal == PHIUser) { - // Scalarize the binary operation. Its first operand is the - // scalar PHI, and the second operand is extracted from the other + // Scalarize the binary operation. One operand is the + // scalar PHI, and the other is extracted from the other // vector operand. 
BinaryOperator *B0 = cast(PHIUser); unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0; @@ -149,9 +149,14 @@ Instruction *InstCombinerImpl::scalarizePHI(ExtractElementInst &EI, ExtractElementInst::Create(B0->getOperand(opId), Elt, B0->getOperand(opId)->getName() + ".Elt"), B0->getIterator()); - Value *newPHIUser = InsertNewInstWith( - BinaryOperator::CreateWithCopiedFlags(B0->getOpcode(), - scalarPHI, Op, B0), B0->getIterator()); + // Preserve operand order for binary operation to preserve semantics of + // non-commutative operations. + Value *FirstOp = (B0->getOperand(0) == PN) ? scalarPHI : Op; + Value *SecondOp = (B0->getOperand(0) == PN) ? Op : scalarPHI; + Value *newPHIUser = + InsertNewInstWith(BinaryOperator::CreateWithCopiedFlags( + B0->getOpcode(), FirstOp, SecondOp, B0), + B0->getIterator()); scalarPHI->addIncoming(newPHIUser, inBB); } else { // Scalarize PHI input: diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 86e742ca5fec1..ba21bbbe112e6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -684,7 +684,7 @@ void LoopVectorizationLegality::addInductionPhi( // in the vectorized loop body, record them here. All casts could be recorded // here for ignoring, but suffices to record only the first (as it is the // only one that may bw used outside the cast sequence). - const SmallVectorImpl &Casts = ID.getCastInsts(); + ArrayRef Casts = ID.getCastInsts(); if (!Casts.empty()) InductionCastsToIgnore.insert(*Casts.begin()); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 277e43a38018e..7ac132a99fbec 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6575,8 +6575,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // detection. for (const auto &Induction : Legal->getInductionVars()) { const InductionDescriptor &IndDes = Induction.second; - const SmallVectorImpl &Casts = IndDes.getCastInsts(); - VecValuesToIgnore.insert_range(Casts); + VecValuesToIgnore.insert_range(IndDes.getCastInsts()); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 003a049da91e4..e7a8773be067b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -512,7 +512,7 @@ static void removeRedundantInductionCasts(VPlan &Plan) { // replace it with the original IV. Note that only the final cast is // expected to have users outside the cast-chain and the dead casts left // over will be cleaned up later. - auto &Casts = IV->getInductionDescriptor().getCastInsts(); + ArrayRef Casts = IV->getInductionDescriptor().getCastInsts(); VPValue *FindMyCast = IV; for (Instruction *IRCast : reverse(Casts)) { VPSingleDefRecipe *FoundUserCast = nullptr; @@ -1489,6 +1489,7 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { // operand against vputils::isSingleScalar. 
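On the InstCombineVectorOps.cpp change above: the operand-order bookkeeping matters because binary operators such as sub are not commutative, so when one operand is the scalarized PHI, the other extracted scalar must stay on its original side. A trivial sketch of the invariant being preserved (plain C++ with made-up values, not the pass's code):

#include <cassert>

int main() {
  int phi = 7, other = 2;
  // Scalarizing "phi - other" must not silently become "other - phi".
  assert(phi - other == 5);
  assert(other - phi == -5);
  return 0;
}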
assert(RepOrWidenR != Store->getStoredValue() || vputils::isSingleScalar(Store->getStoredValue())); + (void)Store; return true; } diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll index 2103bc30b8381..6917ac12999bf 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16 +; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16,NOB16B16-NONSTREAMING ; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s --check-prefixes=CHECK,B16B16 -; RUN: llc -mattr=+sme,+sve-b16b16 -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16 +; RUN: llc -mattr=+sme,+sve-b16b16 -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16,NOB16B16-STREAMING ; RUN: llc -mattr=+sme2,+sve-b16b16 -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,B16B16 target triple = "aarch64-unknown-linux-gnu" @@ -530,49 +530,80 @@ define @fmul_nxv2bf16( %a, %a, %b + %res = fmul nsz %a, %b ret %res } define @fmul_nxv4bf16( %a, %b) { -; NOB16B16-LABEL: fmul_nxv4bf16: -; NOB16B16: // %bb.0: -; NOB16B16-NEXT: lsl z1.s, z1.s, #16 -; NOB16B16-NEXT: lsl z0.s, z0.s, #16 -; NOB16B16-NEXT: ptrue p0.s -; NOB16B16-NEXT: fmul z0.s, z0.s, z1.s -; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s -; NOB16B16-NEXT: ret +; NOB16B16-NONSTREAMING-LABEL: fmul_nxv4bf16: +; NOB16B16-NONSTREAMING: // %bb.0: +; NOB16B16-NONSTREAMING-NEXT: movi v2.2d, #0000000000000000 +; NOB16B16-NONSTREAMING-NEXT: ptrue p0.s +; NOB16B16-NONSTREAMING-NEXT: bfmlalb z2.s, z0.h, z1.h +; NOB16B16-NONSTREAMING-NEXT: bfcvt z0.h, p0/m, z2.s +; NOB16B16-NONSTREAMING-NEXT: ret ; ; B16B16-LABEL: fmul_nxv4bf16: ; B16B16: // %bb.0: ; B16B16-NEXT: ptrue p0.s ; B16B16-NEXT: bfmul z0.h, p0/m, z0.h, z1.h ; B16B16-NEXT: ret - %res = fmul %a, %b +; +; NOB16B16-STREAMING-LABEL: fmul_nxv4bf16: +; NOB16B16-STREAMING: // %bb.0: +; NOB16B16-STREAMING-NEXT: mov z2.s, #0 // =0x0 +; NOB16B16-STREAMING-NEXT: ptrue p0.s +; NOB16B16-STREAMING-NEXT: bfmlalb z2.s, z0.h, z1.h +; NOB16B16-STREAMING-NEXT: bfcvt z0.h, p0/m, z2.s +; NOB16B16-STREAMING-NEXT: ret + %res = fmul nsz %a, %b ret %res } define @fmul_nxv8bf16( %a, %b) { -; NOB16B16-LABEL: fmul_nxv8bf16: +; NOB16B16-NONSTREAMING-LABEL: fmul_nxv8bf16: +; NOB16B16-NONSTREAMING: // %bb.0: +; NOB16B16-NONSTREAMING-NEXT: movi v2.2d, #0000000000000000 +; NOB16B16-NONSTREAMING-NEXT: movi v3.2d, #0000000000000000 +; NOB16B16-NONSTREAMING-NEXT: ptrue p0.s +; NOB16B16-NONSTREAMING-NEXT: bfmlalb z2.s, z0.h, z1.h +; NOB16B16-NONSTREAMING-NEXT: bfmlalt z3.s, z0.h, z1.h +; NOB16B16-NONSTREAMING-NEXT: bfcvt z0.h, p0/m, z2.s +; NOB16B16-NONSTREAMING-NEXT: bfcvtnt z0.h, p0/m, z3.s +; NOB16B16-NONSTREAMING-NEXT: ret +; +; B16B16-LABEL: fmul_nxv8bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: bfmul z0.h, z0.h, z1.h +; B16B16-NEXT: ret +; +; NOB16B16-STREAMING-LABEL: fmul_nxv8bf16: +; NOB16B16-STREAMING: // %bb.0: +; NOB16B16-STREAMING-NEXT: mov z2.s, #0 // =0x0 +; NOB16B16-STREAMING-NEXT: mov z3.s, #0 // =0x0 +; NOB16B16-STREAMING-NEXT: ptrue p0.s +; NOB16B16-STREAMING-NEXT: bfmlalb z2.s, z0.h, z1.h +; NOB16B16-STREAMING-NEXT: bfmlalt z3.s, z0.h, z1.h +; NOB16B16-STREAMING-NEXT: bfcvt z0.h, p0/m, z2.s +; NOB16B16-STREAMING-NEXT: bfcvtnt z0.h, p0/m, z3.s +; NOB16B16-STREAMING-NEXT: ret + %res = fmul nsz %a, %b + ret %res +} + 
+define @fmul_nxv8bf16_no_nsz( %a, %b) { +; NOB16B16-LABEL: fmul_nxv8bf16_no_nsz: ; NOB16B16: // %bb.0: -; NOB16B16-NEXT: uunpkhi z2.s, z1.h -; NOB16B16-NEXT: uunpkhi z3.s, z0.h -; NOB16B16-NEXT: uunpklo z1.s, z1.h -; NOB16B16-NEXT: uunpklo z0.s, z0.h +; NOB16B16-NEXT: mov z2.s, #0x80000000 +; NOB16B16-NEXT: mov z3.s, #0x80000000 ; NOB16B16-NEXT: ptrue p0.s -; NOB16B16-NEXT: lsl z2.s, z2.s, #16 -; NOB16B16-NEXT: lsl z3.s, z3.s, #16 -; NOB16B16-NEXT: lsl z1.s, z1.s, #16 -; NOB16B16-NEXT: lsl z0.s, z0.s, #16 -; NOB16B16-NEXT: fmul z2.s, z3.s, z2.s -; NOB16B16-NEXT: fmul z0.s, z0.s, z1.s -; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s -; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s -; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h +; NOB16B16-NEXT: bfmlalb z2.s, z0.h, z1.h +; NOB16B16-NEXT: bfmlalt z3.s, z0.h, z1.h +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z2.s +; NOB16B16-NEXT: bfcvtnt z0.h, p0/m, z3.s ; NOB16B16-NEXT: ret ; -; B16B16-LABEL: fmul_nxv8bf16: +; B16B16-LABEL: fmul_nxv8bf16_no_nsz: ; B16B16: // %bb.0: ; B16B16-NEXT: bfmul z0.h, z0.h, z1.h ; B16B16-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll index be721930cf015..a8049806a679b 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-combines.ll @@ -410,28 +410,21 @@ define @fsub_sel_negzero_nxv8bf16( %a define @fadd_sel_fmul_nxv8bf16( %a, %b, %c, %mask) { ; SVE-LABEL: fadd_sel_fmul_nxv8bf16: ; SVE: // %bb.0: -; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h -; SVE-NEXT: uunpklo z2.s, z2.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: mov z3.s, #0x80000000 +; SVE-NEXT: mov z4.s, #0x80000000 ; SVE-NEXT: ptrue p1.s -; SVE-NEXT: lsl z3.s, z3.s, #16 -; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fmul z3.s, z4.s, z3.s -; SVE-NEXT: fmul z1.s, z1.s, z2.s -; SVE-NEXT: bfcvt z2.h, p1/m, z3.s -; SVE-NEXT: movi v3.2d, #0000000000000000 -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s -; SVE-NEXT: uzp1 z1.h, z1.h, z2.h -; SVE-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfmlalt z4.s, z1.h, z2.h +; SVE-NEXT: movi v2.2d, #0000000000000000 +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s ; SVE-NEXT: uunpkhi z3.s, z0.h ; SVE-NEXT: uunpklo z0.s, z0.h -; SVE-NEXT: uunpkhi z2.s, z1.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: bfcvtnt z1.h, p1/m, z4.s ; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: sel z1.h, p0, z1.h, z2.h +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h ; SVE-NEXT: lsl z2.s, z2.s, #16 ; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: fadd z2.s, z3.s, z2.s @@ -457,24 +450,20 @@ define @fadd_sel_fmul_nxv8bf16( %a, < define @fsub_sel_fmul_nxv8bf16( %a, %b, %c, %mask) { ; SVE-LABEL: fsub_sel_fmul_nxv8bf16: ; SVE: // %bb.0: -; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h -; SVE-NEXT: uunpklo z2.s, z2.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: mov z3.s, #0x80000000 +; SVE-NEXT: mov z4.s, #0x80000000 ; SVE-NEXT: ptrue p1.s -; SVE-NEXT: lsl z3.s, z3.s, #16 -; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fmul z3.s, z4.s, z3.s -; SVE-NEXT: uunpklo z4.s, z0.h -; SVE-NEXT: fmul z1.s, z1.s, z2.s -; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfmlalt z4.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s ; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: bfcvtnt z1.h, p1/m, z4.s +; SVE-NEXT: uunpklo z4.s, z0.h +; 
SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h ; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s ; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: fsub z2.s, z3.s, z2.s ; SVE-NEXT: fsub z1.s, z4.s, z1.s @@ -497,24 +486,20 @@ define @fsub_sel_fmul_nxv8bf16( %a, < define @fadd_sel_fmul_nsz_nxv8bf16( %a, %b, %c, %mask) { ; SVE-LABEL: fadd_sel_fmul_nsz_nxv8bf16: ; SVE: // %bb.0: -; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h -; SVE-NEXT: uunpklo z2.s, z2.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: mov z3.s, #0x80000000 +; SVE-NEXT: mov z4.s, #0x80000000 ; SVE-NEXT: ptrue p1.s -; SVE-NEXT: lsl z3.s, z3.s, #16 -; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fmul z3.s, z4.s, z3.s -; SVE-NEXT: uunpklo z4.s, z0.h -; SVE-NEXT: fmul z1.s, z1.s, z2.s -; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfmlalt z4.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s ; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: bfcvtnt z1.h, p1/m, z4.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h ; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s ; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: fadd z2.s, z3.s, z2.s ; SVE-NEXT: fadd z1.s, z4.s, z1.s @@ -537,24 +522,20 @@ define @fadd_sel_fmul_nsz_nxv8bf16( % define @fsub_sel_fmul_nsz_nxv8bf16( %a, %b, %c, %mask) { ; SVE-LABEL: fsub_sel_fmul_nsz_nxv8bf16: ; SVE: // %bb.0: -; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h -; SVE-NEXT: uunpklo z2.s, z2.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: mov z3.s, #0x80000000 +; SVE-NEXT: mov z4.s, #0x80000000 ; SVE-NEXT: ptrue p1.s -; SVE-NEXT: lsl z3.s, z3.s, #16 -; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fmul z3.s, z4.s, z3.s -; SVE-NEXT: uunpklo z4.s, z0.h -; SVE-NEXT: fmul z1.s, z1.s, z2.s -; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfmlalt z4.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s ; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: bfcvtnt z1.h, p1/m, z4.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h ; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s ; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: fsub z2.s, z3.s, z2.s ; SVE-NEXT: fsub z1.s, z4.s, z1.s @@ -577,24 +558,20 @@ define @fsub_sel_fmul_nsz_nxv8bf16( % define @fadd_sel_fmul_negzero_nxv8bf16( %a, %b, %c, %mask) { ; SVE-LABEL: fadd_sel_fmul_negzero_nxv8bf16: ; SVE: // %bb.0: -; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h -; SVE-NEXT: uunpklo z2.s, z2.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: mov z3.s, #0x80000000 +; SVE-NEXT: mov z4.s, #0x80000000 ; SVE-NEXT: ptrue p1.s -; SVE-NEXT: lsl z3.s, z3.s, #16 -; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fmul z3.s, z4.s, z3.s -; SVE-NEXT: uunpklo z4.s, z0.h -; SVE-NEXT: fmul z1.s, z1.s, z2.s -; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfmlalt z4.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s ; SVE-NEXT: uunpkhi 
z3.s, z0.h +; SVE-NEXT: bfcvtnt z1.h, p1/m, z4.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h ; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s ; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: fadd z2.s, z3.s, z2.s ; SVE-NEXT: fadd z1.s, z4.s, z1.s @@ -618,28 +595,21 @@ define @fadd_sel_fmul_negzero_nxv8bf16( @fsub_sel_fmul_negzero_nxv8bf16( %a, %b, %c, %mask) { ; SVE-LABEL: fsub_sel_fmul_negzero_nxv8bf16: ; SVE: // %bb.0: -; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h -; SVE-NEXT: uunpklo z2.s, z2.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: mov z3.s, #0x80000000 +; SVE-NEXT: mov z4.s, #0x80000000 ; SVE-NEXT: ptrue p1.s -; SVE-NEXT: lsl z3.s, z3.s, #16 -; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fmul z3.s, z4.s, z3.s -; SVE-NEXT: fmul z1.s, z1.s, z2.s -; SVE-NEXT: bfcvt z2.h, p1/m, z3.s -; SVE-NEXT: dupm z3.h, #0x8000 -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s -; SVE-NEXT: uzp1 z1.h, z1.h, z2.h -; SVE-NEXT: sel z1.h, p0, z1.h, z3.h +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfmlalt z4.s, z1.h, z2.h +; SVE-NEXT: dupm z2.h, #0x8000 +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s ; SVE-NEXT: uunpkhi z3.s, z0.h ; SVE-NEXT: uunpklo z0.s, z0.h -; SVE-NEXT: uunpkhi z2.s, z1.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: bfcvtnt z1.h, p1/m, z4.s ; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z0.s, z0.s, #16 +; SVE-NEXT: sel z1.h, p0, z1.h, z2.h +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h ; SVE-NEXT: lsl z2.s, z2.s, #16 ; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: fsub z2.s, z3.s, z2.s @@ -666,24 +636,20 @@ define @fsub_sel_fmul_negzero_nxv8bf16( @fadd_sel_fmul_negzero_nsz_nxv8bf16( %a, %b, %c, %mask) { ; SVE-LABEL: fadd_sel_fmul_negzero_nsz_nxv8bf16: ; SVE: // %bb.0: -; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h -; SVE-NEXT: uunpklo z2.s, z2.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: mov z3.s, #0x80000000 +; SVE-NEXT: mov z4.s, #0x80000000 ; SVE-NEXT: ptrue p1.s -; SVE-NEXT: lsl z3.s, z3.s, #16 -; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fmul z3.s, z4.s, z3.s -; SVE-NEXT: uunpklo z4.s, z0.h -; SVE-NEXT: fmul z1.s, z1.s, z2.s -; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfmlalt z4.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s ; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: bfcvtnt z1.h, p1/m, z4.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h ; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s ; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: fadd z2.s, z3.s, z2.s ; SVE-NEXT: fadd z1.s, z4.s, z1.s @@ -707,24 +673,20 @@ define @fadd_sel_fmul_negzero_nsz_nxv8bf16( @fsub_sel_fmul_negzero_nsz_nxv8bf16( %a, %b, %c, %mask) { ; SVE-LABEL: fsub_sel_fmul_negzero_nsz_nxv8bf16: ; SVE: // %bb.0: -; SVE-NEXT: uunpkhi z3.s, z2.h -; SVE-NEXT: uunpkhi z4.s, z1.h -; SVE-NEXT: uunpklo z2.s, z2.h -; SVE-NEXT: uunpklo z1.s, z1.h +; SVE-NEXT: mov z3.s, #0x80000000 +; SVE-NEXT: mov z4.s, #0x80000000 ; SVE-NEXT: ptrue p1.s -; SVE-NEXT: lsl z3.s, z3.s, #16 -; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z1.s, z1.s, #16 -; SVE-NEXT: fmul z3.s, z4.s, 
z3.s -; SVE-NEXT: uunpklo z4.s, z0.h -; SVE-NEXT: fmul z1.s, z1.s, z2.s -; SVE-NEXT: bfcvt z2.h, p1/m, z3.s +; SVE-NEXT: bfmlalb z3.s, z1.h, z2.h +; SVE-NEXT: bfmlalt z4.s, z1.h, z2.h +; SVE-NEXT: bfcvt z1.h, p1/m, z3.s ; SVE-NEXT: uunpkhi z3.s, z0.h +; SVE-NEXT: bfcvtnt z1.h, p1/m, z4.s +; SVE-NEXT: uunpklo z4.s, z0.h +; SVE-NEXT: lsl z3.s, z3.s, #16 +; SVE-NEXT: uunpkhi z2.s, z1.h +; SVE-NEXT: uunpklo z1.s, z1.h ; SVE-NEXT: lsl z4.s, z4.s, #16 -; SVE-NEXT: bfcvt z1.h, p1/m, z1.s ; SVE-NEXT: lsl z2.s, z2.s, #16 -; SVE-NEXT: lsl z3.s, z3.s, #16 ; SVE-NEXT: lsl z1.s, z1.s, #16 ; SVE-NEXT: fsub z2.s, z3.s, z2.s ; SVE-NEXT: fsub z1.s, z4.s, z1.s diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll index a7ebf458d2591..2cb1811ff4f09 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll @@ -2019,6 +2019,1007 @@ endif: store i64 %combine, ptr addrspace(1) %out ret void } + +define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) { +; GFX8DAGISEL-LABEL: uniform_value_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: uniform_value_float: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: uniform_value_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: uniform_value_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: uniform_value_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: 
s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +; +; GFX12DAGISEL-LABEL: uniform_value_float: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mul_f32_e32 v0, s2, v0 +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12DAGISEL-NEXT: s_wait_alu 0xf1ff +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12DAGISEL-NEXT: s_endpgm +entry: + %result = call float @llvm.amdgcn.wave.reduce.fadd(float %in, i32 1) + store float %result, ptr addrspace(1) %out + ret void +} + +define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) { +; 
GFX8DAGISEL-LABEL: divergent_value_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_brev_b32 s6, 1 +; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: v_add_f32_e32 v3, s6, v3 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8GISEL-LABEL: divergent_value_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_brev_b32 s6, 1 +; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: v_add_f32_e32 v3, s6, v3 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9DAGISEL-LABEL: divergent_value_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_brev_b32 s6, 1 +; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: v_add_f32_e32 v3, s6, v3 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GISEL-LABEL: divergent_value_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_brev_b32 s6, 1 +; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: v_add_f32_e32 v3, s6, v3 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064DAGISEL-LABEL: divergent_value_float: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], 
exec +; GFX1064DAGISEL-NEXT: s_brev_b32 s6, 1 +; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: v_add_f32_e64 v3, s6, s8 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064GISEL-LABEL: divergent_value_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_brev_b32 s6, 1 +; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: v_add_f32_e64 v3, s6, s8 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032DAGISEL-LABEL: divergent_value_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_brev_b32 s5, 1 +; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: v_add_f32_e64 v3, s5, s7 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032GISEL-LABEL: divergent_value_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_brev_b32 s5, 1 +; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: v_add_f32_e64 v3, s5, s7 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164DAGISEL-LABEL: divergent_value_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_brev_b32 s2, 1 +; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164DAGISEL-NEXT: v_add_f32_e64 v3, s2, s4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-LABEL: divergent_value_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: s_brev_b32 s2, 1 +; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-NEXT: v_add_f32_e64 v3, s2, s4 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-LABEL: divergent_value_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: s_brev_b32 s1, 1 +; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132DAGISEL-NEXT: v_add_f32_e64 v3, s1, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-LABEL: divergent_value_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132GISEL-NEXT: s_brev_b32 s1, 1 +; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132GISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132GISEL-NEXT: v_add_f32_e64 v3, s1, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12DAGISEL-LABEL: divergent_value_float: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0 
+; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX12DAGISEL-NEXT: s_brev_b32 s1, 1 +; GFX12DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX12DAGISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX12DAGISEL-NEXT: v_add_f32_e64 v3, s1, s3 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX12DAGISEL-NEXT: ; %bb.2: +; GFX12DAGISEL-NEXT: s_wait_alu 0xf1ff +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX12DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX12DAGISEL-NEXT: s_setpc_b64 s[30:31] +entry: + %result = call float @llvm.amdgcn.wave.reduce.fadd(float %id.x, i32 1) + store float %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) { +; GFX8DAGISEL-LABEL: divergent_cfg_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[1:2], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8GISEL-NEXT: .LBB8_2: ; %Flow +; GFX8GISEL-NEXT: 
s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8GISEL-NEXT: .LBB8_4: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9GISEL-NEXT: .LBB8_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9GISEL-NEXT: .LBB8_4: ; %endif +; 
GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg_float: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1064DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1064DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1064DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1064GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1064GISEL-NEXT: .LBB8_4: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_cfg_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1032DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s0, s3 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032GISEL-NEXT: .LBB8_4: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: ; 
implicit-def: $sgpr6 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1164DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1164DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1164DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1164GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX1164GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1164GISEL-NEXT: .LBB8_4: ; %endif +; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1132DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1132DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s0, s3 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132GISEL-NEXT: .LBB8_4: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +; +; GFX12DAGISEL-LABEL: divergent_cfg_float: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX12DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX12DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX12DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX12DAGISEL-NEXT: ; %bb.1: ; %else +; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mul_f32_e32 v0, s0, v0 +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX12DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX12DAGISEL-NEXT: ; %bb.3: ; %if +; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mul_f32_e32 v0, s1, v0 +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12DAGISEL-NEXT: s_wait_alu 0xf1ff +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX12DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12DAGISEL-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label 
%if, label %else + +if: + %reducedValTid = call float @llvm.amdgcn.wave.reduce.fadd(float %in2, i32 1) + br label %endif + +else: + %reducedValIn = call float @llvm.amdgcn.wave.reduce.fadd(float %in, i32 1) + br label %endif + +endif: + %combine = phi float [%reducedValTid, %if], [%reducedValIn, %else] + store float %combine, ptr addrspace(1) %out + ret void +} ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX10DAGISEL: {{.*}} ; GFX10GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll index fab269ea8cfb9..50ec375d04626 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll @@ -2220,6 +2220,1007 @@ endif: store i64 %combine, ptr addrspace(1) %out ret void } + +define amdgpu_kernel void @uniform_value_float(ptr addrspace(1) %out, float %in) { +; GFX8DAGISEL-LABEL: uniform_value_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: uniform_value_float: +; 
GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: uniform_value_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1064GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: uniform_value_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: uniform_value_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b32 s2, 
s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +; +; GFX12DAGISEL-LABEL: uniform_value_float: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mul_f32_e64 v0, -s2, v0 +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12DAGISEL-NEXT: s_wait_alu 0xf1ff +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12DAGISEL-NEXT: s_endpgm +entry: + %result = call float @llvm.amdgcn.wave.reduce.fsub(float %in, i32 1) + store float %result, ptr addrspace(1) %out + ret void +} + +define void @divergent_value_float(ptr addrspace(1) %out, float %id.x) { +; GFX8DAGISEL-LABEL: divergent_value_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8DAGISEL-NEXT: v_sub_f32_e32 v3, s6, v3 +; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX8DAGISEL-NEXT: ; %bb.2: +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8GISEL-LABEL: divergent_value_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL-NEXT: s_mov_b32 s6, 0 +; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX8GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL-NEXT: v_sub_f32_e32 v3, s6, v3 +; GFX8GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX8GISEL-NEXT: ; %bb.2: +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX8GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9DAGISEL-LABEL: divergent_value_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9DAGISEL-NEXT: v_sub_f32_e32 v3, s6, v3 +; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9DAGISEL-NEXT: ; %bb.2: +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GISEL-LABEL: divergent_value_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL-NEXT: v_sub_f32_e32 v3, s6, v3 +; GFX9GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX9GISEL-NEXT: ; %bb.2: +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064DAGISEL-LABEL: divergent_value_float: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; 
GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064DAGISEL-NEXT: v_sub_f32_e64 v3, s6, s8 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1064DAGISEL-NEXT: ; %bb.2: +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1064DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1064GISEL-LABEL: divergent_value_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 +; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s7 +; GFX1064GISEL-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL-NEXT: v_sub_f32_e64 v3, s6, s8 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v3 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1064GISEL-NEXT: ; %bb.2: +; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX1064GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1064GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032DAGISEL-LABEL: divergent_value_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032DAGISEL-NEXT: s_mov_b32 s5, 0 +; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032DAGISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032DAGISEL-NEXT: v_sub_f32_e64 v3, s5, s7 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1032DAGISEL-NEXT: ; %bb.2: +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032DAGISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1032DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1032GISEL-LABEL: divergent_value_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL-NEXT: s_mov_b32 s5, 0 +; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: s_ff1_i32_b32 s6, s4 +; GFX1032GISEL-NEXT: v_readlane_b32 s7, v2, s6 +; GFX1032GISEL-NEXT: s_bitset0_b32 s4, s6 +; GFX1032GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL-NEXT: v_sub_f32_e64 v3, s5, s7 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1032GISEL-NEXT: ; %bb.2: +; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX1032GISEL-NEXT: global_store_dword v[0:1], v2, off +; GFX1032GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164DAGISEL-LABEL: divergent_value_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164DAGISEL-NEXT: s_mov_b32 s2, 0 +; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164DAGISEL-NEXT: 
v_sub_f32_e64 v3, s2, s4 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1164DAGISEL-NEXT: ; %bb.2: +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1164DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1164GISEL-LABEL: divergent_value_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec +; GFX1164GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s3 +; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164GISEL-NEXT: v_sub_f32_e64 v3, s2, s4 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s2, v3 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1164GISEL-NEXT: ; %bb.2: +; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1164GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132DAGISEL-LABEL: divergent_value_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132DAGISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132DAGISEL-NEXT: v_sub_f32_e64 v3, s1, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1132DAGISEL-NEXT: ; %bb.2: +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1132DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX1132GISEL-LABEL: divergent_value_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1132GISEL-NEXT: s_mov_b32 s1, 0 +; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX1132GISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX1132GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1132GISEL-NEXT: v_sub_f32_e64 v3, s1, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX1132GISEL-NEXT: ; %bb.2: +; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX1132GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX1132GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12DAGISEL-LABEL: divergent_value_float: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_expcnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_mov_b32 s0, exec_lo 
+; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0 +; GFX12DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s2, s0 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: v_readlane_b32 s3, v2, s2 +; GFX12DAGISEL-NEXT: s_bitset0_b32 s0, s2 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX12DAGISEL-NEXT: v_sub_f32_e64 v3, s1, s3 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v3 +; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 +; GFX12DAGISEL-NEXT: ; %bb.2: +; GFX12DAGISEL-NEXT: s_wait_alu 0xf1ff +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX12DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; GFX12DAGISEL-NEXT: s_setpc_b64 s[30:31] +entry: + %result = call float @llvm.amdgcn.wave.reduce.fsub(float %id.x, i32 1) + store float %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg_float(ptr addrspace(1) %out, float %in, float %in2) { +; GFX8DAGISEL-LABEL: divergent_cfg_float: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX8DAGISEL-NEXT: ; %bb.1: ; %else +; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX8DAGISEL-NEXT: ; %bb.3: ; %if +; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX8DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX8DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX8DAGISEL-NEXT: flat_store_dword v[1:2], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_cfg_float: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX8GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX8GISEL-NEXT: ; %bb.1: ; %else +; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8GISEL-NEXT: .LBB8_2: ; %Flow +; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX8GISEL-NEXT: ; %bb.3: ; %if +; 
GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX8GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX8GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX8GISEL-NEXT: .LBB8_4: ; %endif +; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_cfg_float: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX9DAGISEL-NEXT: ; %bb.1: ; %else +; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX9DAGISEL-NEXT: ; %bb.3: ; %if +; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX9DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX9DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_cfg_float: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX9GISEL-NEXT: ; %bb.1: ; %else +; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9GISEL-NEXT: .LBB8_2: ; %Flow +; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX9GISEL-NEXT: ; %bb.3: ; %if +; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX9GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX9GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX9GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9GISEL-NEXT: .LBB8_4: ; %endif +; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_cfg_float: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1064DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1064DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1064DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1064DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1064DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_cfg_float: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1064GISEL-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1064GISEL-NEXT: ; %bb.1: ; %else +; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow +; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1064GISEL-NEXT: ; %bb.3: ; %if +; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1064GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1064GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1064GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1064GISEL-NEXT: .LBB8_4: ; %endif +; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; 
GFX1032DAGISEL-LABEL: divergent_cfg_float: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1032DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1032DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1032DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_cfg_float: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX1032GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1032GISEL-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1032GISEL-NEXT: ; %bb.1: ; %else +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s0, s3 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1032GISEL-NEXT: ; %bb.3: ; %if +; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1032GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032GISEL-NEXT: .LBB8_4: ; %endif +; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_cfg_float: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: 
v_cmpx_lt_u32_e32 15, v0 +; GFX1164DAGISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1164DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1164DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1164DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_cfg_float: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1164GISEL-NEXT: ; %bb.1: ; %else +; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s6 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow +; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3] +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1164GISEL-NEXT: ; %bb.3: ; %if +; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GFX1164GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1164GISEL-NEXT: .LBB8_4: ; %endif +; 
GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_cfg_float: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1132DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if +; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1132DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX1132DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_cfg_float: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: ; implicit-def: $sgpr2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX1132GISEL-NEXT: ; %bb.1: ; %else +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1132GISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s0, s3 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX1132GISEL-NEXT: ; %bb.3: ; %if +; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132GISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX1132GISEL-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132GISEL-NEXT: .LBB8_4: ; %endif +; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +; +; GFX12DAGISEL-LABEL: divergent_cfg_float: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c +; GFX12DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX12DAGISEL-NEXT: ; implicit-def: $sgpr3 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX12DAGISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_2 +; GFX12DAGISEL-NEXT: ; %bb.1: ; %else +; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mul_f32_e64 v0, -s0, v0 +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s3, v0 +; GFX12DAGISEL-NEXT: .LBB8_2: ; %Flow +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, s2 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_4 +; GFX12DAGISEL-NEXT: ; %bb.3: ; %if +; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX12DAGISEL-NEXT: s_wait_alu 0xfffe +; GFX12DAGISEL-NEXT: v_cvt_f32_i32_e32 v0, s2 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mul_f32_e64 v0, -s1, v0 +; GFX12DAGISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12DAGISEL-NEXT: s_wait_alu 0xf1ff +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, s1 +; GFX12DAGISEL-NEXT: .LBB8_4: ; %endif +; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12DAGISEL-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call float @llvm.amdgcn.wave.reduce.fsub(float 
%in2, i32 1) + br label %endif + +else: + %reducedValIn = call float @llvm.amdgcn.wave.reduce.fsub(float %in, i32 1) + br label %endif + +endif: + %combine = phi float [%reducedValTid, %if], [%reducedValIn, %else] + store float %combine, ptr addrspace(1) %out + ret void +} ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX10DAGISEL: {{.*}} ; GFX10GISEL: {{.*}} diff --git a/llvm/test/CodeGen/LoongArch/lasx/and-not-combine.ll b/llvm/test/CodeGen/LoongArch/lasx/and-not-combine.ll index 67549599db2f3..aa67a20ab08a7 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/and-not-combine.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/and-not-combine.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s -; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @and_not_combine_v32i8(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ; CHECK-LABEL: and_not_combine_v32i8: @@ -85,3 +85,425 @@ entry: store <4 x i64> %and, ptr %res ret void } + +define void @pre_not_and_not_combine_v32i8(ptr %res, ptr %a, i8 %b) nounwind { +; CHECK-LABEL: pre_not_and_not_combine_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: nor $a1, $a2, $zero +; CHECK-NEXT: xvreplgr2vr.b $xr1, $a1 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <32 x i8>, ptr %a + %b.not = xor i8 %b, -1 + %b.not.ele = insertelement <32 x i8> poison, i8 %b.not, i64 0 + %v1.not = shufflevector <32 x i8> %b.not.ele, <32 x i8> poison, <32 x i32> zeroinitializer + %v0.not = xor <32 x i8> %v0, splat (i8 -1) + %and = and <32 x i8> %v0.not, %v1.not + store <32 x i8> %and, ptr %res + ret void +} + +define void @post_not_and_not_combine_v32i8(ptr %res, ptr %a, i8 %b) nounwind { +; CHECK-LABEL: post_not_and_not_combine_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvreplgr2vr.b $xr1, $a2 +; CHECK-NEXT: xvxori.b $xr1, $xr1, 255 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <32 x i8>, ptr %a + %b.ele = insertelement <32 x i8> poison, i8 %b, i64 0 + %v1 = shufflevector <32 x i8> %b.ele, <32 x i8> poison, <32 x i32> zeroinitializer + %v0.not = xor <32 x i8> %v0, splat (i8 -1) + %v1.not = xor <32 x i8> %v1, splat (i8 -1) + %and = and <32 x i8> %v0.not, %v1.not + store <32 x i8> %and, ptr %res + ret void +} + +define void @pre_not_and_not_combine_v16i16(ptr %res, ptr %a, i16 %b) nounwind { +; CHECK-LABEL: pre_not_and_not_combine_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: nor $a1, $a2, $zero +; CHECK-NEXT: xvreplgr2vr.h $xr1, $a1 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <16 x i16>, ptr %a + %b.not = xor i16 %b, -1 + %b.not.ele = insertelement <16 x i16> poison, i16 %b.not, i64 0 + %v1.not = shufflevector <16 x i16> %b.not.ele, <16 x i16> poison, <16 x i32> zeroinitializer + %v0.not = xor <16 x i16> %v0, splat (i16 -1) + %and = and <16 x i16> %v0.not, %v1.not + store <16 x i16> %and, ptr %res + ret void +} + +define void @post_not_and_not_combine_v16i16(ptr %res, ptr %a, i16 %b) nounwind { +; CHECK-LABEL: 
post_not_and_not_combine_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvreplgr2vr.h $xr1, $a2 +; CHECK-NEXT: xvrepli.b $xr2, -1 +; CHECK-NEXT: xvxor.v $xr1, $xr1, $xr2 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <16 x i16>, ptr %a + %b.ele = insertelement <16 x i16> poison, i16 %b, i64 0 + %v1 = shufflevector <16 x i16> %b.ele, <16 x i16> poison, <16 x i32> zeroinitializer + %v0.not = xor <16 x i16> %v0, splat (i16 -1) + %v1.not = xor <16 x i16> %v1, splat (i16 -1) + %and = and <16 x i16> %v0.not, %v1.not + store <16 x i16> %and, ptr %res + ret void +} + +define void @pre_not_and_not_combine_v8i32(ptr %res, ptr %a, i32 %b) nounwind { +; CHECK-LABEL: pre_not_and_not_combine_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: nor $a1, $a2, $zero +; CHECK-NEXT: xvreplgr2vr.w $xr1, $a1 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <8 x i32>, ptr %a + %b.not = xor i32 %b, -1 + %b.not.ele = insertelement <8 x i32> poison, i32 %b.not, i64 0 + %v1.not = shufflevector <8 x i32> %b.not.ele, <8 x i32> poison, <8 x i32> zeroinitializer + %v0.not = xor <8 x i32> %v0, splat (i32 -1) + %and = and <8 x i32> %v0.not, %v1.not + store <8 x i32> %and, ptr %res + ret void +} + +define void @post_not_and_not_combine_v8i32(ptr %res, ptr %a, i32 %b) nounwind { +; CHECK-LABEL: post_not_and_not_combine_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvreplgr2vr.w $xr1, $a2 +; CHECK-NEXT: xvrepli.b $xr2, -1 +; CHECK-NEXT: xvxor.v $xr1, $xr1, $xr2 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <8 x i32>, ptr %a + %b.ele = insertelement <8 x i32> poison, i32 %b, i64 0 + %v1 = shufflevector <8 x i32> %b.ele, <8 x i32> poison, <8 x i32> zeroinitializer + %v0.not = xor <8 x i32> %v0, splat (i32 -1) + %v1.not = xor <8 x i32> %v1, splat (i32 -1) + %and = and <8 x i32> %v0.not, %v1.not + store <8 x i32> %and, ptr %res + ret void +} + +define void @pre_not_and_not_combine_v4i64(ptr %res, ptr %a, i64 %b) nounwind { +; LA32-LABEL: pre_not_and_not_combine_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: nor $a1, $a3, $zero +; LA32-NEXT: nor $a2, $a2, $zero +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; LA32-NEXT: xvreplve0.d $xr1, $xr1 +; LA32-NEXT: xvandn.v $xr0, $xr0, $xr1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: pre_not_and_not_combine_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: nor $a1, $a2, $zero +; LA64-NEXT: xvreplgr2vr.d $xr1, $a1 +; LA64-NEXT: xvandn.v $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret + %v0 = load <4 x i64>, ptr %a + %b.not = xor i64 %b, -1 + %b.not.ele = insertelement <4 x i64> poison, i64 %b.not, i64 0 + %v1.not = shufflevector <4 x i64> %b.not.ele, <4 x i64> poison, <4 x i32> zeroinitializer + %v0.not = xor <4 x i64> %v0, splat (i64 -1) + %and = and <4 x i64> %v0.not, %v1.not + store <4 x i64> %and, ptr %res + ret void +} + +define void @post_not_and_not_combine_v4i64(ptr %res, ptr %a, i64 %b) nounwind { +; LA32-LABEL: post_not_and_not_combine_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a3, 1 +; LA32-NEXT: xvreplve0.d $xr1, $xr1 +; LA32-NEXT: xvrepli.b $xr2, -1 +; LA32-NEXT: xvxor.v $xr1, $xr1, $xr2 +; LA32-NEXT: xvandn.v $xr0, $xr0, 
$xr1 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: post_not_and_not_combine_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: xvreplgr2vr.d $xr1, $a2 +; LA64-NEXT: xvrepli.b $xr2, -1 +; LA64-NEXT: xvxor.v $xr1, $xr1, $xr2 +; LA64-NEXT: xvandn.v $xr0, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a0, 0 +; LA64-NEXT: ret + %v0 = load <4 x i64>, ptr %a + %b.ele = insertelement <4 x i64> poison, i64 %b, i64 0 + %v1 = shufflevector <4 x i64> %b.ele, <4 x i64> poison, <4 x i32> zeroinitializer + %v0.not = xor <4 x i64> %v0, splat (i64 -1) + %v1.not = xor <4 x i64> %v1, splat (i64 -1) + %and = and <4 x i64> %v0.not, %v1.not + store <4 x i64> %and, ptr %res + ret void +} + +define void @and_not_combine_splatimm_v32i8(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: and_not_combine_splatimm_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvrepli.b $xr1, -4 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <32 x i8>, ptr %a0 + %and = and <32 x i8> %v0, splat (i8 -4) + %xor = xor <32 x i8> %and, splat (i8 -4) + store <32 x i8> %xor, ptr %res + ret void +} + +define void @and_not_combine_splatimm_v16i16(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: and_not_combine_splatimm_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvrepli.h $xr1, -4 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <16 x i16>, ptr %a0 + %and = and <16 x i16> %v0, splat (i16 -4) + %xor = xor <16 x i16> %and, splat (i16 -4) + store <16 x i16> %xor, ptr %res + ret void +} + +define void @and_not_combine_splatimm_v8i32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: and_not_combine_splatimm_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvrepli.w $xr1, -4 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <8 x i32>, ptr %a0 + %and = and <8 x i32> %v0, splat (i32 -4) + %xor = xor <8 x i32> %and, splat (i32 -4) + store <8 x i32> %xor, ptr %res + ret void +} + +define void @and_not_combine_splatimm_v4i64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: and_not_combine_splatimm_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvrepli.d $xr1, -4 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x i64>, ptr %a0 + %and = and <4 x i64> %v0, splat (i64 -4) + %xor = xor <4 x i64> %and, splat (i64 -4) + store <4 x i64> %xor, ptr %res + ret void +} + +define void @and_or_not_combine_v32i8(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounwind { +; CHECK-LABEL: and_or_not_combine_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvld $xr2, $a1, 0 +; CHECK-NEXT: xvseq.b $xr0, $xr1, $xr0 +; CHECK-NEXT: xvxori.b $xr0, $xr0, 255 +; CHECK-NEXT: xvseq.b $xr1, $xr1, $xr2 +; CHECK-NEXT: xvorn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvandi.b $xr0, $xr0, 4 +; CHECK-NEXT: xvst $xr0, $a3, 0 +; CHECK-NEXT: ret + %a = load <32 x i8>, ptr %pa + %b = load <32 x i8>, ptr %pb + %v = load <32 x i8>, ptr %pv + %ca = icmp ne <32 x i8> %v, %a + %cb = icmp ne <32 x i8> %v, %b + %or = or <32 x i1> %ca, %cb + %ext = sext <32 x i1> %or to <32 x i8> + %and = and <32 x i8> %ext, splat (i8 4) + store <32 x i8> %and, ptr %dst + ret void +} + +define void @and_or_not_combine_v16i16(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounwind { +; CHECK-LABEL: and_or_not_combine_v16i16: +; CHECK: # 
%bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvld $xr2, $a1, 0 +; CHECK-NEXT: xvseq.h $xr0, $xr1, $xr0 +; CHECK-NEXT: xvrepli.b $xr3, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr3 +; CHECK-NEXT: xvseq.h $xr1, $xr1, $xr2 +; CHECK-NEXT: xvorn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvrepli.h $xr1, 4 +; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a3, 0 +; CHECK-NEXT: ret + %a = load <16 x i16>, ptr %pa + %b = load <16 x i16>, ptr %pb + %v = load <16 x i16>, ptr %pv + %ca = icmp ne <16 x i16> %v, %a + %cb = icmp ne <16 x i16> %v, %b + %or = or <16 x i1> %ca, %cb + %ext = sext <16 x i1> %or to <16 x i16> + %and = and <16 x i16> %ext, splat (i16 4) + store <16 x i16> %and, ptr %dst + ret void +} + +define void @and_or_not_combine_v8i32(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounwind { +; CHECK-LABEL: and_or_not_combine_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvld $xr2, $a1, 0 +; CHECK-NEXT: xvseq.w $xr0, $xr1, $xr0 +; CHECK-NEXT: xvrepli.b $xr3, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr3 +; CHECK-NEXT: xvseq.w $xr1, $xr1, $xr2 +; CHECK-NEXT: xvorn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvrepli.w $xr1, 4 +; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a3, 0 +; CHECK-NEXT: ret + %a = load <8 x i32>, ptr %pa + %b = load <8 x i32>, ptr %pb + %v = load <8 x i32>, ptr %pv + %ca = icmp ne <8 x i32> %v, %a + %cb = icmp ne <8 x i32> %v, %b + %or = or <8 x i1> %ca, %cb + %ext = sext <8 x i1> %or to <8 x i32> + %and = and <8 x i32> %ext, splat (i32 4) + store <8 x i32> %and, ptr %dst + ret void +} + +define void @and_or_not_combine_v4i64(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounwind { +; CHECK-LABEL: and_or_not_combine_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvld $xr2, $a1, 0 +; CHECK-NEXT: xvseq.d $xr0, $xr1, $xr0 +; CHECK-NEXT: xvrepli.b $xr3, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr3 +; CHECK-NEXT: xvseq.d $xr1, $xr1, $xr2 +; CHECK-NEXT: xvorn.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvrepli.d $xr1, 4 +; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a3, 0 +; CHECK-NEXT: ret + %a = load <4 x i64>, ptr %pa + %b = load <4 x i64>, ptr %pb + %v = load <4 x i64>, ptr %pv + %ca = icmp ne <4 x i64> %v, %a + %cb = icmp ne <4 x i64> %v, %b + %or = or <4 x i1> %ca, %cb + %ext = sext <4 x i1> %or to <4 x i64> + %and = and <4 x i64> %ext, splat (i64 4) + store <4 x i64> %and, ptr %dst + ret void +} + +define void @and_extract_subvector_not_combine_v32i8(ptr %pa, ptr %dst) nounwind { +; CHECK-LABEL: and_extract_subvector_not_combine_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvxori.b $xr0, $xr0, 255 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vandi.b $vr0, $vr0, 4 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %a = load volatile <32 x i8>, ptr %pa + %a.not = xor <32 x i8> %a, splat (i8 -1) + %subv = shufflevector <32 x i8> %a.not, <32 x i8> poison, + <16 x i32> + %and = and <16 x i8> %subv, splat (i8 4) + store <16 x i8> %and, ptr %dst + ret void +} + +define void @and_extract_subvector_not_combine_v16i16(ptr %pa, ptr %dst) nounwind { +; CHECK-LABEL: and_extract_subvector_not_combine_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvrepli.b $xr1, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vrepli.h $vr1, 4 +; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a1, 0 +; 
CHECK-NEXT: ret + %a = load volatile <16 x i16>, ptr %pa + %a.not = xor <16 x i16> %a, splat (i16 -1) + %subv = shufflevector <16 x i16> %a.not, <16 x i16> poison, + <8 x i32> + %and = and <8 x i16> %subv, splat (i16 4) + store <8 x i16> %and, ptr %dst + ret void +} + +define void @and_extract_subvector_not_combine_v8i32(ptr %pa, ptr %dst) nounwind { +; CHECK-LABEL: and_extract_subvector_not_combine_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvrepli.b $xr1, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vrepli.w $vr1, 4 +; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %a = load volatile <8 x i32>, ptr %pa + %a.not = xor <8 x i32> %a, splat (i32 -1) + %subv = shufflevector <8 x i32> %a.not, <8 x i32> poison, <4 x i32> + %and = and <4 x i32> %subv, splat (i32 4) + store <4 x i32> %and, ptr %dst + ret void +} + +define void @and_extract_subvector_not_combine_v4i64(ptr %pa, ptr %dst) nounwind { +; CHECK-LABEL: and_extract_subvector_not_combine_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvrepli.b $xr1, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vrepli.d $vr1, 4 +; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %a = load volatile <4 x i64>, ptr %pa + %a.not = xor <4 x i64> %a, splat (i64 -1) + %subv = shufflevector <4 x i64> %a.not, <4 x i64> poison, <2 x i32> + %and = and <2 x i64> %subv, splat (i64 4) + store <2 x i64> %and, ptr %dst + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/and-not-combine.ll b/llvm/test/CodeGen/LoongArch/lsx/and-not-combine.ll index 3c6d34505e114..960d8c4b156b5 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/and-not-combine.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/and-not-combine.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s -; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @and_not_combine_v16i8(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { ; CHECK-LABEL: and_not_combine_v16i8: @@ -85,3 +85,348 @@ entry: store <2 x i64> %and, ptr %res ret void } + +define void @pre_not_and_not_combine_v16i8(ptr %res, ptr %a, i8 %b) nounwind { +; CHECK-LABEL: pre_not_and_not_combine_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: nor $a1, $a2, $zero +; CHECK-NEXT: vreplgr2vr.b $vr1, $a1 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <16 x i8>, ptr %a + %b.not = xor i8 %b, -1 + %b.not.ele = insertelement <16 x i8> poison, i8 %b.not, i64 0 + %v1.not = shufflevector <16 x i8> %b.not.ele, <16 x i8> poison, <16 x i32> zeroinitializer + %v0.not = xor <16 x i8> %v0, splat (i8 -1) + %and = and <16 x i8> %v0.not, %v1.not + store <16 x i8> %and, ptr %res + ret void +} + +define void @post_not_and_not_combine_v16i8(ptr %res, ptr %a, i8 %b) nounwind { +; CHECK-LABEL: post_not_and_not_combine_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vreplgr2vr.b $vr1, $a2 +; CHECK-NEXT: vxori.b $vr1, $vr1, 255 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; 
CHECK-NEXT: ret + %v0 = load <16 x i8>, ptr %a + %b.ele = insertelement <16 x i8> poison, i8 %b, i64 0 + %v1 = shufflevector <16 x i8> %b.ele, <16 x i8> poison, <16 x i32> zeroinitializer + %v0.not = xor <16 x i8> %v0, splat (i8 -1) + %v1.not = xor <16 x i8> %v1, splat (i8 -1) + %and = and <16 x i8> %v0.not, %v1.not + store <16 x i8> %and, ptr %res + ret void +} + +define void @pre_not_and_not_combine_v8i16(ptr %res, ptr %a, i16 %b) nounwind { +; CHECK-LABEL: pre_not_and_not_combine_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: nor $a1, $a2, $zero +; CHECK-NEXT: vreplgr2vr.h $vr1, $a1 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <8 x i16>, ptr %a + %b.not = xor i16 %b, -1 + %b.not.ele = insertelement <8 x i16> poison, i16 %b.not, i64 0 + %v1.not = shufflevector <8 x i16> %b.not.ele, <8 x i16> poison, <8 x i32> zeroinitializer + %v0.not = xor <8 x i16> %v0, splat (i16 -1) + %and = and <8 x i16> %v0.not, %v1.not + store <8 x i16> %and, ptr %res + ret void +} + +define void @post_not_and_not_combine_v8i16(ptr %res, ptr %a, i16 %b) nounwind { +; CHECK-LABEL: post_not_and_not_combine_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vreplgr2vr.h $vr1, $a2 +; CHECK-NEXT: vrepli.b $vr2, -1 +; CHECK-NEXT: vxor.v $vr1, $vr1, $vr2 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <8 x i16>, ptr %a + %b.ele = insertelement <8 x i16> poison, i16 %b, i64 0 + %v1 = shufflevector <8 x i16> %b.ele, <8 x i16> poison, <8 x i32> zeroinitializer + %v0.not = xor <8 x i16> %v0, splat (i16 -1) + %v1.not = xor <8 x i16> %v1, splat (i16 -1) + %and = and <8 x i16> %v0.not, %v1.not + store <8 x i16> %and, ptr %res + ret void +} + +define void @pre_not_and_not_combine_v4i32(ptr %res, ptr %a, i32 %b) nounwind { +; CHECK-LABEL: pre_not_and_not_combine_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: nor $a1, $a2, $zero +; CHECK-NEXT: vreplgr2vr.w $vr1, $a1 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x i32>, ptr %a + %b.not = xor i32 %b, -1 + %b.not.ele = insertelement <4 x i32> poison, i32 %b.not, i64 0 + %v1.not = shufflevector <4 x i32> %b.not.ele, <4 x i32> poison, <4 x i32> zeroinitializer + %v0.not = xor <4 x i32> %v0, splat (i32 -1) + %and = and <4 x i32> %v0.not, %v1.not + store <4 x i32> %and, ptr %res + ret void +} + +define void @post_not_and_not_combine_v4i32(ptr %res, ptr %a, i32 %b) nounwind { +; CHECK-LABEL: post_not_and_not_combine_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vreplgr2vr.w $vr1, $a2 +; CHECK-NEXT: vrepli.b $vr2, -1 +; CHECK-NEXT: vxor.v $vr1, $vr1, $vr2 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x i32>, ptr %a + %b.ele = insertelement <4 x i32> poison, i32 %b, i64 0 + %v1 = shufflevector <4 x i32> %b.ele, <4 x i32> poison, <4 x i32> zeroinitializer + %v0.not = xor <4 x i32> %v0, splat (i32 -1) + %v1.not = xor <4 x i32> %v1, splat (i32 -1) + %and = and <4 x i32> %v0.not, %v1.not + store <4 x i32> %and, ptr %res + ret void +} + +define void @pre_not_and_not_combine_v2i64(ptr %res, ptr %a, i64 %b) nounwind { +; LA32-LABEL: pre_not_and_not_combine_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: nor $a1, $a3, $zero +; LA32-NEXT: nor $a2, $a2, $zero +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; 
LA32-NEXT: vreplvei.d $vr1, $vr1, 0 +; LA32-NEXT: vandn.v $vr0, $vr0, $vr1 +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: pre_not_and_not_combine_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a1, 0 +; LA64-NEXT: nor $a1, $a2, $zero +; LA64-NEXT: vreplgr2vr.d $vr1, $a1 +; LA64-NEXT: vandn.v $vr0, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret + %v0 = load <2 x i64>, ptr %a + %b.not = xor i64 %b, -1 + %b.not.ele = insertelement <2 x i64> poison, i64 %b.not, i64 0 + %v1.not = shufflevector <2 x i64> %b.not.ele, <2 x i64> poison, <2 x i32> zeroinitializer + %v0.not = xor <2 x i64> %v0, splat (i64 -1) + %and = and <2 x i64> %v0.not, %v1.not + store <2 x i64> %and, ptr %res + ret void +} + +define void @post_not_and_not_combine_v2i64(ptr %res, ptr %a, i64 %b) nounwind { +; LA32-LABEL: post_not_and_not_combine_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a1, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a3, 1 +; LA32-NEXT: vreplvei.d $vr1, $vr1, 0 +; LA32-NEXT: vrepli.b $vr2, -1 +; LA32-NEXT: vxor.v $vr1, $vr1, $vr2 +; LA32-NEXT: vandn.v $vr0, $vr0, $vr1 +; LA32-NEXT: vst $vr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: post_not_and_not_combine_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a1, 0 +; LA64-NEXT: vreplgr2vr.d $vr1, $a2 +; LA64-NEXT: vrepli.b $vr2, -1 +; LA64-NEXT: vxor.v $vr1, $vr1, $vr2 +; LA64-NEXT: vandn.v $vr0, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a0, 0 +; LA64-NEXT: ret + %v0 = load <2 x i64>, ptr %a + %b.ele = insertelement <2 x i64> poison, i64 %b, i64 0 + %v1 = shufflevector <2 x i64> %b.ele, <2 x i64> poison, <2 x i32> zeroinitializer + %v0.not = xor <2 x i64> %v0, splat (i64 -1) + %v1.not = xor <2 x i64> %v1, splat (i64 -1) + %and = and <2 x i64> %v0.not, %v1.not + store <2 x i64> %and, ptr %res + ret void +} + +define void @and_not_combine_splatimm_v16i8(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: and_not_combine_splatimm_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vrepli.b $vr1, -4 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <16 x i8>, ptr %a0 + %and = and <16 x i8> %v0, splat (i8 -4) + %xor = xor <16 x i8> %and, splat (i8 -4) + store <16 x i8> %xor, ptr %res + ret void +} + +define void @and_not_combine_splatimm_v8i16(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: and_not_combine_splatimm_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vrepli.h $vr1, -4 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <8 x i16>, ptr %a0 + %and = and <8 x i16> %v0, splat (i16 -4) + %xor = xor <8 x i16> %and, splat (i16 -4) + store <8 x i16> %xor, ptr %res + ret void +} + +define void @and_not_combine_splatimm_v4i32(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: and_not_combine_splatimm_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vrepli.w $vr1, -4 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x i32>, ptr %a0 + %and = and <4 x i32> %v0, splat (i32 -4) + %xor = xor <4 x i32> %and, splat (i32 -4) + store <4 x i32> %xor, ptr %res + ret void +} + +define void @and_not_combine_splatimm_v2i64(ptr %res, ptr %a0) nounwind { +; CHECK-LABEL: and_not_combine_splatimm_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vrepli.d $vr1, -4 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <2 x i64>, ptr 
%a0 + %and = and <2 x i64> %v0, splat (i64 -4) + %xor = xor <2 x i64> %and, splat (i64 -4) + store <2 x i64> %xor, ptr %res + ret void +} + +define void @and_or_not_combine_v16i8(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounwind { +; CHECK-LABEL: and_or_not_combine_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vld $vr2, $a1, 0 +; CHECK-NEXT: vseq.b $vr0, $vr1, $vr0 +; CHECK-NEXT: vxori.b $vr0, $vr0, 255 +; CHECK-NEXT: vseq.b $vr1, $vr1, $vr2 +; CHECK-NEXT: vorn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vandi.b $vr0, $vr0, 4 +; CHECK-NEXT: vst $vr0, $a3, 0 +; CHECK-NEXT: ret + %a = load <16 x i8>, ptr %pa + %b = load <16 x i8>, ptr %pb + %v = load <16 x i8>, ptr %pv + %ca = icmp ne <16 x i8> %v, %a + %cb = icmp ne <16 x i8> %v, %b + %or = or <16 x i1> %ca, %cb + %ext = sext <16 x i1> %or to <16 x i8> + %and = and <16 x i8> %ext, splat (i8 4) + store <16 x i8> %and, ptr %dst + ret void +} + +define void @and_or_not_combine_v8i16(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounwind { +; CHECK-LABEL: and_or_not_combine_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vld $vr2, $a1, 0 +; CHECK-NEXT: vseq.h $vr0, $vr1, $vr0 +; CHECK-NEXT: vrepli.b $vr3, -1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr3 +; CHECK-NEXT: vseq.h $vr1, $vr1, $vr2 +; CHECK-NEXT: vorn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vrepli.h $vr1, 4 +; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a3, 0 +; CHECK-NEXT: ret + %a = load <8 x i16>, ptr %pa + %b = load <8 x i16>, ptr %pb + %v = load <8 x i16>, ptr %pv + %ca = icmp ne <8 x i16> %v, %a + %cb = icmp ne <8 x i16> %v, %b + %or = or <8 x i1> %ca, %cb + %ext = sext <8 x i1> %or to <8 x i16> + %and = and <8 x i16> %ext, splat (i16 4) + store <8 x i16> %and, ptr %dst + ret void +} + +define void @and_or_not_combine_v4i32(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounwind { +; CHECK-LABEL: and_or_not_combine_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vld $vr2, $a1, 0 +; CHECK-NEXT: vseq.w $vr0, $vr1, $vr0 +; CHECK-NEXT: vrepli.b $vr3, -1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr3 +; CHECK-NEXT: vseq.w $vr1, $vr1, $vr2 +; CHECK-NEXT: vorn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vrepli.w $vr1, 4 +; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a3, 0 +; CHECK-NEXT: ret + %a = load <4 x i32>, ptr %pa + %b = load <4 x i32>, ptr %pb + %v = load <4 x i32>, ptr %pv + %ca = icmp ne <4 x i32> %v, %a + %cb = icmp ne <4 x i32> %v, %b + %or = or <4 x i1> %ca, %cb + %ext = sext <4 x i1> %or to <4 x i32> + %and = and <4 x i32> %ext, splat (i32 4) + store <4 x i32> %and, ptr %dst + ret void +} + +define void @and_or_not_combine_v2i64(ptr %pa, ptr %pb, ptr %pv, ptr %dst) nounwind { +; CHECK-LABEL: and_or_not_combine_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vld $vr2, $a1, 0 +; CHECK-NEXT: vseq.d $vr0, $vr1, $vr0 +; CHECK-NEXT: vrepli.b $vr3, -1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr3 +; CHECK-NEXT: vseq.d $vr1, $vr1, $vr2 +; CHECK-NEXT: vorn.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vrepli.d $vr1, 4 +; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a3, 0 +; CHECK-NEXT: ret + %a = load <2 x i64>, ptr %pa + %b = load <2 x i64>, ptr %pb + %v = load <2 x i64>, ptr %pv + %ca = icmp ne <2 x i64> %v, %a + %cb = icmp ne <2 x i64> %v, %b + %or = or <2 x i1> %ca, %cb + %ext = sext <2 x i1> %or to <2 x i64> + %and = and <2 x i64> %ext, splat (i64 4) + store <2 x i64> %and, ptr 
%dst + ret void +} diff --git a/llvm/test/CodeGen/RISCV/remat.ll b/llvm/test/CodeGen/RISCV/remat.ll index 8490dd0877d30..8a252751165d0 100644 --- a/llvm/test/CodeGen/RISCV/remat.ll +++ b/llvm/test/CodeGen/RISCV/remat.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -O1 -mtriple=riscv64 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -O1 -mtriple=riscv64 -mattr=+d,+zfh,+zfbfmin -verify-machineinstrs < %s | FileCheck %s @a = common global i32 0, align 4 @l = common global i32 0, align 4 @@ -200,3 +200,170 @@ for.end: ; preds = %for.inc, %entry } declare i32 @foo(i32, i32, i32, i32, i32, i32) + +define void @remat_load(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, double %8, double %9, double %10, double %11, double %12, double %13, double %14, double %15, i8 %stackarg0, i16 %stackarg1, i32 %stackarg2, i64 %stackarg3, half %stackarg4, bfloat %stackarg5, float %stackarg6, double %stackarg7, ptr %p) nounwind { +; CHECK-LABEL: remat_load: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi sp, sp, -208 +; CHECK-NEXT: sd ra, 200(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 192(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 184(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s2, 176(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s3, 168(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s4, 160(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s5, 152(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s6, 144(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s7, 136(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s8, 128(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s9, 120(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s10, 112(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s11, 104(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs0, 96(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs1, 88(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs2, 80(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs3, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs4, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs5, 56(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs6, 48(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs7, 40(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs8, 32(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs9, 24(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs10, 16(sp) # 8-byte Folded Spill +; CHECK-NEXT: fsd fs11, 8(sp) # 8-byte Folded Spill +; CHECK-NEXT: fld fa5, 264(sp) +; CHECK-NEXT: flw fa4, 256(sp) +; CHECK-NEXT: flh fa3, 248(sp) +; CHECK-NEXT: flh fa2, 240(sp) +; CHECK-NEXT: ld a0, 272(sp) +; CHECK-NEXT: lbu a4, 208(sp) +; CHECK-NEXT: lh a3, 216(sp) +; CHECK-NEXT: lw a2, 224(sp) +; CHECK-NEXT: ld a1, 232(sp) +; CHECK-NEXT: sb a4, 0(a0) +; CHECK-NEXT: sh a3, 0(a0) +; CHECK-NEXT: sw a2, 0(a0) +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: fsh fa2, 0(a0) +; CHECK-NEXT: fsh fa3, 0(a0) +; CHECK-NEXT: fsw fa4, 0(a0) +; CHECK-NEXT: fsd fa5, 0(a0) +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: ld a0, 272(sp) +; CHECK-NEXT: lbu a1, 208(sp) +; CHECK-NEXT: sb a1, 0(a0) +; CHECK-NEXT: lh a1, 216(sp) +; CHECK-NEXT: sh a1, 0(a0) +; CHECK-NEXT: lw a1, 224(sp) +; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: ld a1, 232(sp) +; CHECK-NEXT: sd a1, 0(a0) +; CHECK-NEXT: flh fa5, 240(sp) +; CHECK-NEXT: fsh fa5, 0(a0) +; CHECK-NEXT: flh fa5, 248(sp) +; CHECK-NEXT: fsh fa5, 0(a0) +; CHECK-NEXT: flw fa5, 256(sp) +; CHECK-NEXT: fsw fa5, 0(a0) +; CHECK-NEXT: fld fa5, 264(sp) +; CHECK-NEXT: fsd fa5, 0(a0) +; CHECK-NEXT: ld ra, 200(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 192(sp) 
# 8-byte Folded Reload +; CHECK-NEXT: ld s1, 184(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s2, 176(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s3, 168(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s4, 160(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s5, 152(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s6, 144(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s7, 136(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s8, 128(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s9, 120(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s10, 112(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s11, 104(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs0, 96(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs1, 88(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs2, 80(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs3, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs4, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs5, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs6, 48(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs7, 40(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs8, 32(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs9, 24(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs10, 16(sp) # 8-byte Folded Reload +; CHECK-NEXT: fld fs11, 8(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 208 +; CHECK-NEXT: ret +entry: + ; Add a use of the stack arguments here so that we will have to load them from + ; the stack before the inline asm. Otherwise we would be exercising the + ; machine scheduler, not rematerialization. + store volatile i8 %stackarg0, ptr %p + store volatile i16 %stackarg1, ptr %p + store volatile i32 %stackarg2, ptr %p + store volatile i64 %stackarg3, ptr %p + store volatile half %stackarg4, ptr %p + store volatile bfloat %stackarg5, ptr %p + store volatile float %stackarg6, ptr %p + store volatile double %stackarg7, ptr %p + tail call void asm sideeffect "", "~{x1},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31},~{f0},~{f1},~{f2},~{f3},~{f4},~{f5},~{f6},~{f7},~{f8},~{f9},~{f10},~{f11},~{f12},~{f13},~{f14},~{f15},~{f16},~{f17},~{f18},~{f19},~{f20},~{f21},~{f22},~{f23},~{f24},~{f25},~{f26},~{f27},~{f28},~{f29},~{f30},~{f31}"() + ; Now use them after spilling everything to force rematerialization + store volatile i8 %stackarg0, ptr %p + store volatile i16 %stackarg1, ptr %p + store volatile i32 %stackarg2, ptr %p + store volatile i64 %stackarg3, ptr %p + store volatile half %stackarg4, ptr %p + store volatile bfloat %stackarg5, ptr %p + store volatile float %stackarg6, ptr %p + store volatile double %stackarg7, ptr %p + ret void +} + +; We could remat the load of the constant global if we extended the live +; interval of the high bits of the address. 
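+; For illustration only (hypothetical codegen, not checked by this test): if
+; the %hi(const) half of the address stayed available across the inline asm,
+; the value could be rematerialized after the clobber as
+;   lui a0, %hi(const)
+;   lw  a0, %lo(const)(a0)
+; instead of being spilled to the stack and reloaded, as the CHECK lines above
+; currently show.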
+ +@const = external constant i32 +define i32 @constglobal_load() nounwind { +; CHECK-LABEL: constglobal_load: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi sp, sp, -112 +; CHECK-NEXT: sd ra, 104(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 96(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 88(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s2, 80(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s3, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s4, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s5, 56(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s6, 48(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s7, 40(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s8, 32(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s9, 24(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s10, 16(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s11, 8(sp) # 8-byte Folded Spill +; CHECK-NEXT: lui a0, %hi(const) +; CHECK-NEXT: lw a0, %lo(const)(a0) +; CHECK-NEXT: sd a0, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: ld a0, 0(sp) # 8-byte Folded Reload +; CHECK-NEXT: addiw a0, a0, 1 +; CHECK-NEXT: ld ra, 104(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 96(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 88(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s2, 80(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s3, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s4, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s5, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s6, 48(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s7, 40(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s8, 32(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s9, 24(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s10, 16(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s11, 8(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 112 +; CHECK-NEXT: ret +entry: + %global = load i32, ptr @const + tail call void asm sideeffect "", "~{x1},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() + %a = add i32 %global, 1 + ret i32 %a +} diff --git a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll index ab9849631663c..01d66b344ec2e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll @@ -40,8 +40,6 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, poison, float %t0, i32 0 + %splat = shufflevector <4 x float> %insert, <4 x float> poison, <4 x i32> zeroinitializer + br label %for.cond + +for.cond: + %x.0 = phi <4 x float> [ %splat, %entry ], [ %sub, %for.body ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %t1 = load i32, ptr %n, align 4 + %cmp = icmp ne i32 %i.0, %t1 + br i1 %cmp, label %for.body, label %for.end + +for.body: + %t2 = extractelement <4 x float> %x.0, i32 1 + store volatile float %t2, ptr %inout, align 4 + %sub = fsub <4 x float> zeroinitializer, %x.0 + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret void +} + define float @extract_element_binop_splat_constant_index(<4 x float> %x) { ; ; CHECK-LABEL: @extract_element_binop_splat_constant_index( diff --git a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll index 2d48d20ba9c5c..220a4a29a3041 100644 --- a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll +++ b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll @@ -358,6 +358,7 @@ loop: exit: ret 
float %rdx.next } + define i32 @test_smin(ptr %src, i64 %n) { ; CHECK-LABEL: define i32 @test_smin( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { @@ -623,3 +624,56 @@ loop: exit: ret i32 %rdx.next } + +define <4 x i32> @test_vector_add(ptr %p, i64 %n, <4 x i32> %start) { +; CHECK-LABEL: define <4 x i32> @test_vector_add( +; CHECK-SAME: ptr [[P:%.*]], i64 [[N:%.*]], <4 x i32> [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_2:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_3:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi <4 x i32> [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load <4 x i32>, ptr [[GEP]], align 16 +; CHECK-NEXT: [[RDX_NEXT]] = add <4 x i32> [[RDX]], [[L]] +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load <4 x i32>, ptr [[GEP_1]], align 16 +; CHECK-NEXT: [[RDX_NEXT_1]] = add <4 x i32> [[RDX_1]], [[L_1]] +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[L_2:%.*]] = load <4 x i32>, ptr [[GEP_2]], align 16 +; CHECK-NEXT: [[RDX_NEXT_2]] = add <4 x i32> [[RDX_2]], [[L_2]] +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[P]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[L_3:%.*]] = load <4 x i32>, ptr [[GEP_3]], align 16 +; CHECK-NEXT: [[RDX_NEXT_3]] = add <4 x i32> [[RDX_3]], [[L_3]] +; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 +; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi <4 x i32> [ [[RDX_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[RDX_NEXT_1]], [[RDX_NEXT]] +; CHECK-NEXT: [[BIN_RDX1:%.*]] = add <4 x i32> [[RDX_NEXT_2]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[RDX_NEXT_3]], [[BIN_RDX1]] +; CHECK-NEXT: ret <4 x i32> [[BIN_RDX2]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi <4 x i32> [ %start, %entry ], [ %rdx.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep = getelementptr inbounds nuw <4 x i32>, ptr %p, i64 %iv + %l = load <4 x i32>, ptr %gep, align 16 + %rdx.next = add <4 x i32> %rdx, %l + %ec = icmp ne i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret <4 x i32> %rdx.next +} diff --git a/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll index a5ac2cf46653d..fb1f2fcf5c190 100644 --- a/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll +++ b/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll @@ -220,6 +220,72 @@ exit: ret i32 %res } +define <4 x i32> @test_vector_add_reduction(ptr %a, i64 %n) { +; CHECK-LABEL: define <4 x i32> 
@test_vector_add_reduction( +; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1 +; CHECK-NEXT: br i1 [[TMP1]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[ENTRY_NEW:.*]] +; CHECK: [[ENTRY_NEW]]: +; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY_NEW]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[GEP_A]], align 16 +; CHECK-NEXT: [[RDX_NEXT]] = add <4 x i32> [[RDX]], [[TMP2]] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[A]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[GEP_A_1]], align 16 +; CHECK-NEXT: [[RDX_NEXT_1]] = add <4 x i32> [[RDX_1]], [[TMP3]] +; CHECK-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2 +; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]] +; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[EXIT_UNR_LCSSA]]: +; CHECK-NEXT: [[RES_PH:%.*]] = phi <4 x i32> [ [[RDX_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_UNR:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_UNR:%.*]] = phi <4 x i32> [ [[RDX_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[RDX_NEXT_1]], [[RDX_NEXT]] +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER]], label %[[EXIT:.*]] +; CHECK: [[LOOP_EPIL_PREHEADER]]: +; CHECK-NEXT: [[IV_EPIL_INIT:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR]], %[[EXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[RDX_EPIL_INIT:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[BIN_RDX]], %[[EXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[LCMP_MOD2:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[LCMP_MOD2]]) +; CHECK-NEXT: br label %[[LOOP_EPIL:.*]] +; CHECK: [[LOOP_EPIL]]: +; CHECK-NEXT: [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw <4 x i32>, ptr [[A]], i64 [[IV_EPIL_INIT]] +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[GEP_A_EPIL]], align 16 +; CHECK-NEXT: [[RDX_NEXT_EPIL:%.*]] = add <4 x i32> [[RDX_EPIL_INIT]], [[TMP4]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi <4 x i32> [ [[BIN_RDX]], %[[EXIT_UNR_LCSSA]] ], [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ] +; CHECK-NEXT: ret <4 x i32> [[RES]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi <4 x i32> [ zeroinitializer, %entry ], [ %rdx.next, %loop ] + %gep.a = getelementptr inbounds nuw <4 x i32>, ptr %a, i64 %iv + %1 = load <4 x i32>, ptr %gep.a, align 16 + %rdx.next = add <4 x i32> %rdx, %1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop, 
!llvm.loop !0 + +exit: + %res = phi <4 x i32> [ %rdx.next, %loop ] + ret <4 x i32> %res +} !0 = distinct !{!0, !1} @@ -234,4 +300,5 @@ exit: ; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]} ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} ;. diff --git a/llvm/unittests/ExecutionEngine/Orc/WaitingOnGraphTest.cpp b/llvm/unittests/ExecutionEngine/Orc/WaitingOnGraphTest.cpp index 08b4e8f40e3d0..0d4a5212c1f0c 100644 --- a/llvm/unittests/ExecutionEngine/Orc/WaitingOnGraphTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/WaitingOnGraphTest.cpp @@ -399,6 +399,31 @@ TEST_F(WaitingOnGraphTest, Emit_TrivialSequence) { EXPECT_EQ(ER1.Failed.size(), 0U); } +TEST_F(WaitingOnGraphTest, Emit_SingleContainerSimpleCycle) { + // Test an emit of two nodes with a dependence cycle within a single + // container: + // N0: (0, 0) -> (0, 1) + // N1: (0, 1) -> (0, 0) + // We expect intra-simplify cycle elimination to clear both dependence sets, + // and coalescing to join them into one supernode covering both defs. + SuperNodeBuilder B; + ContainerElementsMap Defs0({{0, {0}}}); + ContainerElementsMap Deps0({{0, {1}}}); + B.add(Defs0, Deps0); + + auto ER0 = emit(TestGraph::simplify(B.takeSuperNodes())); + EXPECT_EQ(ER0.Ready.size(), 0U); + EXPECT_EQ(ER0.Failed.size(), 0U); + + ContainerElementsMap Defs1({{0, {1}}}); + ContainerElementsMap Deps1({{0, {0}}}); + B.add(Defs1, Deps1); + auto ER1 = emit(TestGraph::simplify(B.takeSuperNodes())); + + EXPECT_EQ(collapseDefs(ER1.Ready), merge(Defs0, Defs1)); + EXPECT_EQ(ER1.Failed.size(), 0U); +} + TEST_F(WaitingOnGraphTest, Emit_TrivialReverseSequence) { // Perform a sequence of two emits where the first emit depends on the // second. Check that both nodes become ready after the second emit. diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp index 2bea0ca2bfabb..8251c8983cc80 100644 --- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp @@ -2922,6 +2922,14 @@ TreePattern::TreePattern(const Record *TheRec, const DagInit *Pat, bool isInput, Trees.push_back(ParseTreePattern(Pat, "")); } +TreePattern::TreePattern(const Record *TheRec, ArrayRef Args, + ArrayRef ArgNames, bool isInput, + CodeGenDAGPatterns &cdp) + : TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false), + Infer(*this) { + Trees.push_back(ParseRootlessTreePattern(Args, ArgNames)); +} + TreePattern::TreePattern(const Record *TheRec, TreePatternNodePtr Pat, bool isInput, CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false), @@ -2950,6 +2958,19 @@ void TreePattern::ComputeNamedNodes(TreePatternNode &N) { ComputeNamedNodes(Child); } +TreePatternNodePtr +TreePattern::ParseRootlessTreePattern(ArrayRef Args, + ArrayRef ArgNames) { + std::vector Children; + + for (auto [Arg, ArgName] : llvm::zip_equal(Args, ArgNames)) { + StringRef NameStr = ArgName ? ArgName->getValue() : ""; + Children.push_back(ParseTreePattern(Arg, NameStr)); + } + + return makeIntrusiveRefCnt(nullptr, std::move(Children), 1); +} + TreePatternNodePtr TreePattern::ParseTreePattern(const Init *TheInit, StringRef OpName) { RecordKeeper &RK = TheInit->getRecordKeeper(); @@ -3487,20 +3508,12 @@ void CodeGenDAGPatterns::ParseDefaultOperands() { ArrayRef DefaultOps = Records.getAllDerivedDefinitions("OperandWithDefaultOps"); - // Find some SDNode. 
- assert(!SDNodes.empty() && "No SDNodes parsed?"); - const Init *SomeSDNode = SDNodes.begin()->first->getDefInit(); - for (unsigned i = 0, e = DefaultOps.size(); i != e; ++i) { const DagInit *DefaultInfo = DefaultOps[i]->getValueAsDag("DefaultOps"); - // Clone the DefaultInfo dag node, changing the operator from 'ops' to - // SomeSDnode so that we can parse this. - const DagInit *DI = DagInit::get(SomeSDNode, DefaultInfo->getArgs(), - DefaultInfo->getArgNames()); - // Create a TreePattern to parse this. - TreePattern P(DefaultOps[i], DI, false, *this); + TreePattern P(DefaultOps[i], DefaultInfo->getArgs(), + DefaultInfo->getArgNames(), false, *this); assert(P.getNumTrees() == 1 && "This ctor can only produce one tree!"); // Copy the operands over into a DAGDefaultOperand. diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h index 633327e2f74e5..9a67933013c1c 100644 --- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h +++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h @@ -917,6 +917,9 @@ class TreePattern { CodeGenDAGPatterns &ise); TreePattern(const Record *TheRec, const DagInit *Pat, bool isInput, CodeGenDAGPatterns &ise); + TreePattern(const Record *TheRec, ArrayRef Args, + ArrayRef ArgNames, bool isInput, + CodeGenDAGPatterns &ise); TreePattern(const Record *TheRec, TreePatternNodePtr Pat, bool isInput, CodeGenDAGPatterns &ise); @@ -981,6 +984,9 @@ class TreePattern { private: TreePatternNodePtr ParseTreePattern(const Init *DI, StringRef OpName); + TreePatternNodePtr + ParseRootlessTreePattern(ArrayRef Args, + ArrayRef ArgNames); void ComputeNamedNodes(); void ComputeNamedNodes(TreePatternNode &N); }; diff --git a/mlir/include/mlir/Dialect/X86Vector/CMakeLists.txt b/mlir/include/mlir/Dialect/X86Vector/CMakeLists.txt index 0fe01824b8248..bbe8e4eb892dd 100644 --- a/mlir/include/mlir/Dialect/X86Vector/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/X86Vector/CMakeLists.txt @@ -3,3 +3,5 @@ add_mlir_doc(X86Vector X86Vector Dialects/ -gen-dialect-doc -dialect=x86vector) add_mlir_interface(X86VectorInterfaces) add_dependencies(MLIRX86VectorIncGen MLIRX86VectorInterfacesIncGen) + +add_subdirectory(TransformOps) diff --git a/mlir/include/mlir/Dialect/X86Vector/TransformOps/CMakeLists.txt b/mlir/include/mlir/Dialect/X86Vector/TransformOps/CMakeLists.txt new file mode 100644 index 0000000000000..6f377e10fa8f8 --- /dev/null +++ b/mlir/include/mlir/Dialect/X86Vector/TransformOps/CMakeLists.txt @@ -0,0 +1,4 @@ +set(LLVM_TARGET_DEFINITIONS X86VectorTransformOps.td) +mlir_tablegen(X86VectorTransformOps.h.inc -gen-op-decls) +mlir_tablegen(X86VectorTransformOps.cpp.inc -gen-op-defs) +add_mlir_dialect_tablegen_target(MLIRX86VectorTransformOpsIncGen) diff --git a/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.h b/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.h new file mode 100644 index 0000000000000..e1d8b8762e799 --- /dev/null +++ b/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.h @@ -0,0 +1,31 @@ +//===- X86VectorTransformOps.h - X86Vector transform ops --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_X86VECTOR_TRANSFORMOPS_X86VECTORTRANSFORMOPS_H +#define MLIR_DIALECT_X86VECTOR_TRANSFORMOPS_X86VECTORTRANSFORMOPS_H + +#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" +#include "mlir/IR/OpImplementation.h" + +//===----------------------------------------------------------------------===// +// X86Vector Transform Operations +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.h.inc" + +namespace mlir { +class DialectRegistry; + +namespace x86vector { +void registerTransformDialectExtension(DialectRegistry ®istry); + +} // namespace x86vector +} // namespace mlir + +#endif // MLIR_DIALECT_X86VECTOR_TRANSFORMOPS_X86VECTORTRANSFORMOPS_H diff --git a/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.td b/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.td new file mode 100644 index 0000000000000..3c5294ff14fc7 --- /dev/null +++ b/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.td @@ -0,0 +1,43 @@ +//===- X86VectorTransformOps.td - X86Vector transform ops --*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef X86VECTOR_TRANSFORM_OPS +#define X86VECTOR_TRANSFORM_OPS + +include "mlir/Dialect/Transform/IR/TransformDialect.td" +include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/IR/OpBase.td" +include "mlir/Dialect/Transform/IR/TransformAttrs.td" +include "mlir/Dialect/Transform/IR/TransformTypes.td" +include "mlir/IR/RegionKindInterface.td" + +def ApplyVectorContractToFMAPatternsOp : Op]> { + let description = [{ + Collect patterns to lower a F32 type vector.contract operation to a FMA. + }]; + + let assemblyFormat = "attr-dict"; +} + +def ApplyVectorContractToPackedTypeDotProductPatternsOp : Op]> { + let description = [{ + Collect patterns to lower a BF16/Int8 type vector.contract operation + to a BF16/Int8 dot-product. + }]; + + let assemblyFormat = "attr-dict"; +} + + +#endif // X86VECTOR_TRANSFORM_OPS + diff --git a/mlir/include/mlir/Dialect/X86Vector/Transforms.h b/mlir/include/mlir/Dialect/X86Vector/Transforms.h index d54111ca41e69..fc46dff63c2b7 100644 --- a/mlir/include/mlir/Dialect/X86Vector/Transforms.h +++ b/mlir/include/mlir/Dialect/X86Vector/Transforms.h @@ -79,6 +79,18 @@ struct MaskHelper { } }; +//===----------------------------------------------------------------------===// + +// A set of patterns for specialized lowering of vector contraction +// operation to vector fused multiply and add (FMA) operation. +void populateVectorContractToFMAPatterns(RewritePatternSet &patterns); + +// A set of patterns for lowering 32-bit packed vector contraction operations +// to their corresponding packed-type dot-product operations, ultimately +// targeting the relevant x86 LLVM intrinsics (e.g., BF16 and Int8). 
+void populateVectorContractToPackedTypeDotProductPatterns( + RewritePatternSet &patterns); + //===----------------------------------------------------------------------===// /// Helpers extracted from: /// - clang/lib/Headers/avxintrin.h diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp index 1b4d1a42614ea..4358ef07da91d 100644 --- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp @@ -519,8 +519,13 @@ struct TransferReadLowering : public OpRewritePattern { return lowerToScatteredLoadOp(readOp, rewriter); } - // Perform common data transfer checks. VectorType vecTy = readOp.getVectorType(); + + // Lower using load.gather in 1D case + if (vecTy.getRank() == 1 && !readOp.hasOutOfBoundsDim()) + return lowerToScatteredLoadOp(readOp, rewriter); + + // Perform common data transfer checks. if (failed(storeLoadPreconditions(rewriter, readOp, vecTy))) return failure(); diff --git a/mlir/lib/Dialect/X86Vector/CMakeLists.txt b/mlir/lib/Dialect/X86Vector/CMakeLists.txt index 9f57627c321fb..cb1e9d01821a2 100644 --- a/mlir/lib/Dialect/X86Vector/CMakeLists.txt +++ b/mlir/lib/Dialect/X86Vector/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(IR) add_subdirectory(Transforms) +add_subdirectory(TransformOps) diff --git a/mlir/lib/Dialect/X86Vector/TransformOps/CMakeLists.txt b/mlir/lib/Dialect/X86Vector/TransformOps/CMakeLists.txt new file mode 100644 index 0000000000000..f4c9f8a05acbc --- /dev/null +++ b/mlir/lib/Dialect/X86Vector/TransformOps/CMakeLists.txt @@ -0,0 +1,17 @@ +add_mlir_dialect_library(MLIRX86VectorTransformOps + X86VectorTransformOps.cpp + + DEPENDS + MLIRX86VectorTransformOpsIncGen + + LINK_LIBS PUBLIC + MLIRIR + MLIRLLVMCommonConversion + MLIRLLVMDialect + MLIRVectorDialect + MLIRSideEffectInterfaces + MLIRTransformDialect + MLIRTransformDialectUtils + MLIRX86VectorDialect + MLIRX86VectorTransforms + ) diff --git a/mlir/lib/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp b/mlir/lib/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp new file mode 100644 index 0000000000000..95db208207672 --- /dev/null +++ b/mlir/lib/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp @@ -0,0 +1,64 @@ +//===- X86VectorTransformOps.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.h" +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/X86Vector/Transforms.h" +#include "mlir/Dialect/X86Vector/X86VectorDialect.h" + +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/RegionKindInterface.h" + +using namespace mlir; +using namespace mlir::x86vector; +using namespace mlir::transform; + +void mlir::transform::ApplyVectorContractToFMAPatternsOp::populatePatterns( + RewritePatternSet &patterns) { + x86vector::populateVectorContractToFMAPatterns(patterns); +} + +void mlir::transform::ApplyVectorContractToPackedTypeDotProductPatternsOp:: + populatePatterns(RewritePatternSet &patterns) { + x86vector::populateVectorContractToPackedTypeDotProductPatterns(patterns); +} + +//===----------------------------------------------------------------------===// +// Transform op registration +//===----------------------------------------------------------------------===// + +namespace { +class X86VectorTransformDialectExtension + : public transform::TransformDialectExtension< + X86VectorTransformDialectExtension> { +public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( + X86VectorTransformDialectExtension) + + X86VectorTransformDialectExtension() { + declareGeneratedDialect(); + declareGeneratedDialect(); + registerTransformOps< +#define GET_OP_LIST +#include "mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp.inc" + >(); + } +}; +} // namespace + +#define GET_OP_CLASSES +#include "mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp.inc" + +void mlir::x86vector::registerTransformDialectExtension( + DialectRegistry ®istry) { + registry.addExtensions(); +} diff --git a/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt b/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt index c51266afe9e8f..2cab50fb591c4 100644 --- a/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt @@ -1,11 +1,14 @@ add_mlir_dialect_library(MLIRX86VectorTransforms AVXTranspose.cpp LegalizeForLLVMExport.cpp + VectorContractToFMA.cpp + VectorContractToPackedTypeDotProduct.cpp LINK_LIBS PUBLIC MLIRArithDialect MLIRX86VectorDialect MLIRIR + MLIRLinalgDialect MLIRLLVMCommonConversion MLIRLLVMDialect MLIRVectorDialect diff --git a/mlir/lib/Dialect/X86Vector/Transforms/VectorContractToFMA.cpp b/mlir/lib/Dialect/X86Vector/Transforms/VectorContractToFMA.cpp new file mode 100644 index 0000000000000..f3af5ca167a35 --- /dev/null +++ b/mlir/lib/Dialect/X86Vector/Transforms/VectorContractToFMA.cpp @@ -0,0 +1,143 @@ +//===- VectorContractToFMA.cpp --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Vector/Utils/VectorUtils.h" +#include "mlir/Dialect/X86Vector/Transforms.h" +#include "mlir/Dialect/X86Vector/X86VectorDialect.h" + +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/Dominance.h" +#include "mlir/IR/PatternMatch.h" + +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +using namespace mlir; +using namespace mlir::vector; +using namespace mlir::x86vector; + +namespace { + +// Implements outer product contraction as a sequence of broadcast and +// FMA operations. +// +// For example - for F32 type: +// ``` +// vector.contract <1x1xf32>, <1x16xf32> into <1x16xf32> +// ``` +// to +// ``` +// vector.broadcast %lhs to <16xf32> +// vector.fma vector<16xf32> +// ``` +struct VectorContractToFMA : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(vector::ContractionOp contractOp, + PatternRewriter &rewriter) const override { + + if (contractOp.getKind() != vector::CombiningKind::ADD) + return rewriter.notifyMatchFailure(contractOp, + "Expects add combining kind."); + + VectorType lhsTy = contractOp.getLhsType(); + if (!lhsTy.getElementType().isF32()) + return rewriter.notifyMatchFailure(contractOp, + "Only F32 lowering is supported."); + + ArrayRef lhsShape = lhsTy.getShape(); + llvm::SmallVector nonUnitDimLhs; + llvm::copy_if(lhsShape, std::back_inserter(nonUnitDimLhs), + [](int64_t dim) { return dim != 1; }); + + VectorType rhsTy = contractOp.getRhsType(); + ArrayRef rhsShape = rhsTy.getShape(); + llvm::SmallVector nonUnitDimRhs; + llvm::copy_if(rhsShape, std::back_inserter(nonUnitDimRhs), + [](int64_t dim) { return dim != 1; }); + + if (nonUnitDimLhs.size() > 0 && nonUnitDimRhs.size() > 0) + return rewriter.notifyMatchFailure( + contractOp, "Expects unit dimensions for either LHS or RHS shape."); + + if (nonUnitDimLhs.size() != 1 && nonUnitDimRhs.size() != 1) + return rewriter.notifyMatchFailure( + contractOp, + "Expects exactly one non-unit A/B dimension for either LHS or RHS shape."); + + VectorType accTy = dyn_cast(contractOp.getAccType()); + if (!accTy) + return rewriter.notifyMatchFailure(contractOp, + "Accumulator is not a vector type."); + + if (!accTy.getElementType().isF32()) + return rewriter.notifyMatchFailure(contractOp, + "Accumulator should be F32 type."); + + ArrayRef accShape = accTy.getShape(); + llvm::SmallVector nonUnitDimAcc; + llvm::copy_if(accShape, std::back_inserter(nonUnitDimAcc), + [](int64_t dim) { return dim != 1; }); + if (nonUnitDimAcc.size() != 1) + return rewriter.notifyMatchFailure( + contractOp, "Expects exactly one non-unit accumulator dimension."); + + // Lowers vector.contract into a broadcast+FMA sequence. + auto loc = contractOp.getLoc(); + auto castAcc = vector::ShapeCastOp::create( + rewriter, loc, + VectorType::get(nonUnitDimAcc.front(), accTy.getElementType()), + contractOp.getAcc()); + + vector::FMAOp fma; + + // Broadcast the unit-dimension LHS or RHS to match the vector length of the + // corresponding non-unit dimension on the other operand. For example, + // if LHS has type vector<1x1xf32> and RHS has type vector<1x16xf32>, we + // broadcast the LHS to vector<16xf32>. In the opposite case (non-unit + // dimension on the LHS), we broadcast the RHS instead. 
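+    // Illustrative sketch (assuming the vector<1x1xf32> x vector<1x16xf32>
+    // example above; the SSA names are made up): the rewrite below emits
+    // roughly
+    //   %acc = vector.shape_cast %acc0 : vector<1x16xf32> to vector<16xf32>
+    //   %lhs = vector.shape_cast %lhs0 : vector<1x1xf32> to vector<1xf32>
+    //   %rhs = vector.shape_cast %rhs0 : vector<1x16xf32> to vector<16xf32>
+    //   %b   = vector.broadcast %lhs : vector<1xf32> to vector<16xf32>
+    //   %fma = vector.fma %b, %rhs, %acc : vector<16xf32>
+    //   %res = vector.shape_cast %fma : vector<16xf32> to vector<1x16xf32>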
+ if (nonUnitDimRhs.size() > 0) { + auto castLhs = vector::ShapeCastOp::create( + rewriter, loc, VectorType::get(1, lhsTy.getElementType()), + contractOp.getLhs()); + auto castRhs = vector::ShapeCastOp::create( + rewriter, loc, + VectorType::get(nonUnitDimRhs.front(), rhsTy.getElementType()), + contractOp.getRhs()); + auto broadcastLhs = vector::BroadcastOp::create( + rewriter, loc, castRhs.getResult().getType(), castLhs); + fma = + vector::FMAOp::create(rewriter, loc, broadcastLhs, castRhs, castAcc); + } else { + auto castLhs = vector::ShapeCastOp::create( + rewriter, loc, + VectorType::get(nonUnitDimLhs.front(), lhsTy.getElementType()), + contractOp.getLhs()); + auto castRhs = vector::ShapeCastOp::create( + rewriter, loc, VectorType::get(1, rhsTy.getElementType()), + contractOp.getRhs()); + auto broadcastRhs = vector::BroadcastOp::create( + rewriter, loc, castLhs.getResult().getType(), castRhs); + fma = + vector::FMAOp::create(rewriter, loc, castLhs, broadcastRhs, castAcc); + } + + auto castFma = vector::ShapeCastOp::create(rewriter, loc, accTy, fma); + rewriter.replaceOp(contractOp, castFma); + + return success(); + } +}; + +} // namespace + +void x86vector::populateVectorContractToFMAPatterns( + RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} diff --git a/mlir/lib/Dialect/X86Vector/Transforms/VectorContractToPackedTypeDotProduct.cpp b/mlir/lib/Dialect/X86Vector/Transforms/VectorContractToPackedTypeDotProduct.cpp new file mode 100644 index 0000000000000..1e64811db910b --- /dev/null +++ b/mlir/lib/Dialect/X86Vector/Transforms/VectorContractToPackedTypeDotProduct.cpp @@ -0,0 +1,301 @@ +//===- VectorContractToPackedTypeDotProduct.cpp ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/Vector/Utils/VectorUtils.h" +#include "mlir/Dialect/X86Vector/Transforms.h" +#include "mlir/Dialect/X86Vector/X86VectorDialect.h" + +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/Dominance.h" +#include "mlir/IR/PatternMatch.h" + +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +using namespace mlir; +using namespace mlir::vector; +using namespace mlir::x86vector; + +namespace { + +static FailureOr> +inferIteratorsFromOutMap(AffineMap map) { + if (!map.isProjectedPermutation()) + return failure(); + SmallVector iterators( + map.getNumDims(), mlir::utils::IteratorType::reduction); + for (auto expr : map.getResults()) + if (auto dim = dyn_cast(expr)) + iterators[dim.getPosition()] = mlir::utils::IteratorType::parallel; + return iterators; +} + +// Returns true if the operation is in VNNI layout. +// Optionally, the check can be constrained to a specific VNNI blocking factor. +static bool isInVnniLayout(Operation *op, ArrayRef indexingMaps, + std::optional blockingFactor) { + // Narrow down type operations - VNNI only applies to contractions. 
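+  // For example (illustrative): with a BF16 blocking factor of 2, VNNI
+  // operands look like A ~ vector<1x1x2xbf16> ([M][K/2][2]) and
+  // B ~ vector<1x16x2xbf16> ([K/2][N][2]); the checks below verify this
+  // structure through the operands' indexing maps.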
+ FailureOr dims = + linalg::inferContractionDims(indexingMaps); + if (failed(dims)) + return false; + + auto matA = op->getOperand(0); + auto matB = op->getOperand(1); + auto typeA = dyn_cast(matA.getType()); + auto typeB = dyn_cast(matB.getType()); + unsigned rankA = typeA.getRank(); + unsigned rankB = typeB.getRank(); + // VNNI format requires at least 1 parallel and 2 reduction dimensions. + if (rankA < 3 || rankB < 3) + return false; + + // At least two reduction dimensions are expected: + // one for the VNNI factor and one for the K dimension + if (dims->k.size() < 2) + return false; + + // Validate affine maps - VNNI computation should be defined by the two + // innermost reduction iterators. + // The input matrix dimensions layout must match the following: + // - matrix A - [...][K/vnniFactor][vnniFactor] + // - matrix B - [...][K/vnniFactor][N][vnniFactor] + auto maybeIters = inferIteratorsFromOutMap(indexingMaps[2]); + if (failed(maybeIters)) + return false; + SmallVector iteratorTypes = *maybeIters; + AffineMap mapA = indexingMaps[0]; + AffineMap mapB = indexingMaps[1]; + + auto vnniDimA = dyn_cast(mapA.getResult(rankA - 1)); + auto vnniDimB = dyn_cast(mapB.getResult(rankB - 1)); + if (!vnniDimA || !vnniDimB || vnniDimA != vnniDimB || + iteratorTypes[vnniDimA.getPosition()] != + mlir::utils::IteratorType::reduction) + return false; + auto redDimA = dyn_cast(mapA.getResult(rankA - 2)); + auto redDimB = dyn_cast(mapB.getResult(rankB - 3)); + if (!redDimA || !redDimB || redDimA != redDimB || + iteratorTypes[redDimA.getPosition()] != + mlir::utils::IteratorType::reduction) + return false; + auto parallelDimB = dyn_cast(mapB.getResult(rankB - 2)); + if (!parallelDimB || iteratorTypes[parallelDimB.getPosition()] != + mlir::utils::IteratorType::parallel) + return false; + + // VNNI factor must be: + // - the innermost inputs' dimension + // - statically known + // - multiple of 2 or equal to the specified factor + auto vnniDimSize = typeB.getShape().back(); + if (vnniDimSize == ShapedType::kDynamic || vnniDimSize == 0 || + vnniDimSize % 2 != 0) + return false; + if (typeA.getShape().back() != vnniDimSize) + return false; + if (blockingFactor && vnniDimSize != *blockingFactor) + return false; + + // The split reduction dimension size should also match. + if (typeA.getShape().end()[-2] != typeB.getShape().end()[-3]) + return false; + + return true; +} + +// Implements packed type outer product contraction as a sequence +// of broadcast and packed dot-product operations. +// +// For example - for F32 type: +// ``` +// vector.contract <1x1x2xbf16>, <1x16x2xbf16> into <1x16xf32> +// ``` +// to +// ``` +// vector.broadcast %lhs to <32xbf16> +// x86vector.avx512.dot vector<32xbf16> -> vector<16xf32> +// ``` +struct VectorContractToPackedTypeDotProduct + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(vector::ContractionOp contractOp, + PatternRewriter &rewriter) const override { + + if (contractOp.getKind() != vector::CombiningKind::ADD) + return rewriter.notifyMatchFailure(contractOp, + "Expects add combining kind."); + + VectorType lhsTy = contractOp.getLhsType(); + if (!lhsTy.getElementType().isBF16() && + !lhsTy.getElementType().isSignlessInteger(8)) + return rewriter.notifyMatchFailure( + contractOp, "Only BF16/Int8 lowering is supported."); + + unsigned int blockingFactor = lhsTy.getElementType().isBF16() ? 
2 : 4; + if (!isInVnniLayout(contractOp.getOperation(), + contractOp.getIndexingMapsArray(), blockingFactor)) + return rewriter.notifyMatchFailure(contractOp, + "Input matrices not in VNNI format."); + + ArrayRef lhsShape = lhsTy.getShape(); + llvm::SmallVector nonUnitDimLhs; + llvm::copy_if(lhsShape, std::back_inserter(nonUnitDimLhs), + [](int64_t dim) { return dim != 1; }); + + VectorType rhsTy = contractOp.getRhsType(); + ArrayRef rhsShape = rhsTy.getShape(); + llvm::SmallVector nonUnitDimRhs; + llvm::copy_if(rhsShape, std::back_inserter(nonUnitDimRhs), + [](int64_t dim) { return dim != 1; }); + + if ((nonUnitDimLhs.size() - 1) > 0 && (nonUnitDimRhs.size() - 1) > 0) + return rewriter.notifyMatchFailure(contractOp, + "Expects unit dimensions for either " + "LHS or RHS shape other than VNNI."); + + if ((nonUnitDimLhs.size() - 1) != 1 && (nonUnitDimRhs.size() - 1) != 1) + return rewriter.notifyMatchFailure( + contractOp, + "Expects exactly one non-unit A/B dimension for either LHS or RHS shape."); + + VectorType accTy = dyn_cast(contractOp.getAccType()); + if (!accTy) + return rewriter.notifyMatchFailure(contractOp, "Wrong accumulator type."); + + if ((lhsTy.getElementType().isBF16() && !accTy.getElementType().isF32()) || + (lhsTy.getElementType().isSignlessInteger(8) && + !accTy.getElementType().isSignlessInteger(32))) + return rewriter.notifyMatchFailure(contractOp, + "Only F32 for BF16 or Int32 for Int8 " + "accumulation type is supported."); + + ArrayRef accShape = accTy.getShape(); + llvm::SmallVector nonUnitDimAcc; + llvm::copy_if(accShape, std::back_inserter(nonUnitDimAcc), + [](int64_t dim) { return dim != 1; }); + if (nonUnitDimAcc.size() != 1) + return rewriter.notifyMatchFailure( + contractOp, "Expects exactly one non-unit accumulator dimension."); + + // Non-unit dimensions should match the vector length of BF16 or Int8 + // dot-product. + unsigned int nonUnitDim = nonUnitDimLhs.size() == 2 ? nonUnitDimLhs.front() + : nonUnitDimRhs.front(); + if (lhsTy.getElementType().isBF16() && nonUnitDim != 4 && nonUnitDim != 8 && + nonUnitDim != 16 && nonUnitDimAcc.front() == nonUnitDim) + return rewriter.notifyMatchFailure( + contractOp, "BF16 dot-product operation expects non-unit (LHS or " + "RHS) dim and acc dim of size 4/8/16."); + + if (lhsTy.getElementType().isSignlessInteger(8) && nonUnitDim != 4 && + nonUnitDim != 8 && nonUnitDimAcc.front() == nonUnitDim) + return rewriter.notifyMatchFailure( + contractOp, "Int8 dot-product operation expects non-unit (LHS or " + "RHS) dim and acc dim of size 4/8."); + + auto loc = contractOp.getLoc(); + auto castAcc = vector::ShapeCastOp::create( + rewriter, loc, + VectorType::get(nonUnitDimAcc.front(), accTy.getElementType()), + contractOp.getAcc()); + + Value dp; + + // Broadcast the unit-dimension LHS or RHS to match the vector length of the + // corresponding non-unit dimension on the other operand. For example, + // if LHS has type vector<1x1x2xbf16> and RHS has type vector<1x16x2xbf16>, + // we broadcast the LHS to vector<32xbf16>. In the opposite case (non-unit + // dimension on the LHS), we broadcast the RHS instead. 
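+    // Illustrative sketch (assuming the vector<1x1x2xbf16> x vector<1x16x2xbf16>
+    // example above; the SSA names are made up): the rewrite below emits
+    // roughly
+    //   %acc = vector.shape_cast %acc0 : vector<1x16xf32> to vector<16xf32>
+    //   %rhs = vector.shape_cast %rhs0 : vector<1x16x2xbf16> to vector<32xbf16>
+    //   %lhs = vector.shape_cast %lhs0 : vector<1x1x2xbf16> to vector<2xbf16>
+    //   %l32 = vector.bitcast %lhs : vector<2xbf16> to vector<1xi32>
+    //   %bc  = vector.broadcast %l32 : vector<1xi32> to vector<16xi32>
+    //   %pk  = vector.bitcast %bc : vector<16xi32> to vector<32xbf16>
+    //   %dp  = x86vector.avx512.dot %acc, %pk, %rhs : vector<32xbf16> -> vector<16xf32>
+    //   %res = vector.shape_cast %dp : vector<16xf32> to vector<1x16xf32>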
+ if ((nonUnitDimRhs.size() - 1) > 0) { + auto castRhs = vector::ShapeCastOp::create( + rewriter, loc, + VectorType::get(nonUnitDimRhs.front() * nonUnitDimRhs.back(), + rhsTy.getElementType()), + contractOp.getRhs()); + auto castLhs = vector::ShapeCastOp::create( + rewriter, loc, + VectorType::get(nonUnitDimLhs.front(), lhsTy.getElementType()), + contractOp.getLhs()); + auto bitcastLhs = vector::BitCastOp::create( + rewriter, loc, VectorType::get({1}, rewriter.getIntegerType(32)), + castLhs); + auto broadcastLhs = vector::BroadcastOp::create( + rewriter, loc, + VectorType::get({nonUnitDimRhs.front()}, rewriter.getIntegerType(32)), + bitcastLhs); + auto bitcastLhsPkType = vector::BitCastOp::create( + rewriter, loc, castRhs.getResult().getType(), broadcastLhs); + + if (lhsTy.getElementType().isBF16()) { + dp = x86vector::DotBF16Op::create( + rewriter, loc, + VectorType::get(nonUnitDimRhs.front(), rewriter.getF32Type()), + castAcc, bitcastLhsPkType, castRhs); + } + + if (lhsTy.getElementType().isSignlessInteger(8)) { + dp = x86vector::DotInt8Op::create( + rewriter, loc, + VectorType::get(nonUnitDimRhs.front(), rewriter.getIntegerType(32)), + castAcc, bitcastLhsPkType, castRhs); + } + } else { + auto castLhs = vector::ShapeCastOp::create( + rewriter, loc, + VectorType::get(nonUnitDimLhs.front() * nonUnitDimLhs.back(), + lhsTy.getElementType()), + contractOp.getLhs()); + auto castRhs = vector::ShapeCastOp::create( + rewriter, loc, + VectorType::get(nonUnitDimRhs.front(), rhsTy.getElementType()), + contractOp.getRhs()); + auto bitcastRhs = vector::BitCastOp::create( + rewriter, loc, VectorType::get({1}, rewriter.getIntegerType(32)), + castRhs); + auto broadcastRhs = vector::BroadcastOp::create( + rewriter, loc, + VectorType::get({nonUnitDimLhs.front()}, rewriter.getIntegerType(32)), + bitcastRhs); + auto bitcastRhsPkType = vector::BitCastOp::create( + rewriter, loc, castLhs.getResult().getType(), broadcastRhs); + + if (lhsTy.getElementType().isBF16()) { + dp = x86vector::DotBF16Op::create( + rewriter, loc, + VectorType::get(nonUnitDimLhs.front(), rewriter.getF32Type()), + castAcc, castLhs, bitcastRhsPkType); + } + + if (lhsTy.getElementType().isSignlessInteger(8)) { + dp = x86vector::DotInt8Op::create( + rewriter, loc, + VectorType::get(nonUnitDimLhs.front(), rewriter.getIntegerType(32)), + castAcc, castLhs, bitcastRhsPkType); + } + } + + if (!dp) + return failure(); + + auto castDp = vector::ShapeCastOp::create(rewriter, loc, accTy, dp); + rewriter.replaceOp(contractOp, castDp); + return success(); + } +}; + +} // namespace + +void x86vector::populateVectorContractToPackedTypeDotProductPatterns( + RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 4455811a2e681..b64eb5b29ccb0 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -989,9 +989,8 @@ struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern { SmallVector newOperands = llvm::map_to_vector( newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); }); - SmallVector newConstOffsets{matrixOp.getConstOffsets()}; - std::fill(newConstOffsets.begin(), newConstOffsets.end(), - ShapedType::kDynamic); + SmallVector newConstOffsets(matrixOp.getConstOffsets().size(), + ShapedType::kDynamic); DenseI64ArrayAttr newConstOffsetsAttr = 
rewriter.getDenseI64ArrayAttr(newConstOffsets); ValueRange currentOffsets = @@ -1066,9 +1065,8 @@ struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern { SmallVector newOperands = llvm::map_to_vector( newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); }); - SmallVector newConstOffsets{matrixOp.getConstOffsets()}; - std::fill(newConstOffsets.begin(), newConstOffsets.end(), - ShapedType::kDynamic); + SmallVector newConstOffsets(matrixOp.getConstOffsets().size(), + ShapedType::kDynamic); DenseI64ArrayAttr newConstOffsetsAttr = rewriter.getDenseI64ArrayAttr(newConstOffsets); ValueRange currentOffsets = diff --git a/mlir/lib/RegisterAllExtensions.cpp b/mlir/lib/RegisterAllExtensions.cpp index c857c38df717c..4312100a0c0b0 100644 --- a/mlir/lib/RegisterAllExtensions.cpp +++ b/mlir/lib/RegisterAllExtensions.cpp @@ -56,6 +56,7 @@ #include "mlir/Dialect/Transform/SMTExtension/SMTExtension.h" #include "mlir/Dialect/Transform/TuneExtension/TuneExtension.h" #include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" +#include "mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.h" #include "mlir/Dialect/XeGPU/TransformOps/XeGPUTransformOps.h" #include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h" #include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h" @@ -113,6 +114,7 @@ void mlir::registerAllExtensions(DialectRegistry ®istry) { transform::registerSMTExtension(registry); transform::registerTuneExtension(registry); vector::registerTransformDialectExtension(registry); + x86vector::registerTransformDialectExtension(registry); xegpu::registerTransformDialectExtension(registry); arm_neon::registerTransformDialectExtension(registry); arm_sve::registerTransformDialectExtension(registry); diff --git a/mlir/lib/Target/Cpp/TranslateToCpp.cpp b/mlir/lib/Target/Cpp/TranslateToCpp.cpp index 8d6d0764bae65..209c31d8dfcf3 100644 --- a/mlir/lib/Target/Cpp/TranslateToCpp.cpp +++ b/mlir/lib/Target/Cpp/TranslateToCpp.cpp @@ -253,8 +253,8 @@ struct CppEmitter { return !fileId.empty() && file.getId() == fileId; } - /// Get expression currently being emitted. - ExpressionOp getEmittedExpression() { return emittedExpression; } + /// Is expression currently being emitted. + bool isEmittingExpression() { return emittedExpression; } /// Determine whether given value is part of the expression potentially being /// emitted. @@ -1718,7 +1718,7 @@ LogicalResult CppEmitter::emitGlobalVariable(GlobalOp op) { LogicalResult CppEmitter::emitAssignPrefix(Operation &op) { // If op is being emitted as part of an expression, bail out. 
- if (getEmittedExpression()) + if (isEmittingExpression()) return success(); switch (op.getNumResults()) { @@ -1800,7 +1800,7 @@ LogicalResult CppEmitter::emitOperation(Operation &op, bool trailingSemicolon) { if (hasDeferredEmission(&op)) return success(); - if (getEmittedExpression() || + if (isEmittingExpression() || (isa(op) && shouldBeInlined(cast(op)))) return success(); diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir index c87a5304babfe..8bb272b1fe5fc 100644 --- a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir @@ -11,14 +11,15 @@ gpu.func @load_1D_vector(%source: memref<8x16x32xf32>, %offset: index) -> vector // LOAD-ND-LABEL: @load_1D_vector( // LOAD-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, -// LOAD-ND-SAME: %[[OFFSET:.+]]: index -// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// LOAD-ND-SAME: %[[COLLAPSED]] -// LOAD-ND-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, -// LOAD-ND-SAME: boundary_check = false -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]]]{{.*}}-> vector<8xf32> -// LOAD-ND: return %[[VEC]] +// LOAD-ND: %[[CST:.+]] = arith.constant dense : vector<8xi1> +// LOAD-ND: %[[STEP:.+]] = vector.step : vector<8xindex> +// LOAD-ND-COUNT2: arith.muli {{.*}} : index +// LOAD-ND-COUNT2: arith.addi {{.*}} : index +// LOAD-ND: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8xindex> +// LOAD-ND: %[[IDX:.+]] = arith.addi %[[SPLAT]], %[[STEP]] : vector<8xindex> +// LOAD-ND: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SRC]] : memref<8x16x32xf32> -> index +// LOAD-ND: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64 +// LOAD-ND: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8xindex>, vector<8xi1> -> vector<8xf32> // LOAD-GATHER-LABEL: @load_1D_vector( // LOAD-GATHER-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, @@ -404,7 +405,7 @@ gpu.func @no_load_unsupported_map(%source: memref<16x32x64xf32>, // ----- gpu.module @xevm_module { -gpu.func @load_from_subview(%source: memref<4096x4096xf16>, %off1: index, %off2: index) -> vector<8xf16> { +gpu.func @load_from_subview_1D(%source: memref<4096x4096xf16>, %off1: index, %off2: index) -> vector<8xf16> { %c0 = arith.constant 0.0 : f16 %subview = memref.subview %source[%off1, %off2] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> %0 = vector.transfer_read %subview[%off2, %off2], %c0 @@ -412,19 +413,23 @@ gpu.func @load_from_subview(%source: memref<4096x4096xf16>, %off1: index, %off2: gpu.return %0 : vector<8xf16> } -// LOAD-ND-LABEL: @load_from_subview( +// LOAD-ND-LABEL: @load_from_subview_1D( // LOAD-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>, // LOAD-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// LOAD-ND: %[[CST:.+]] = arith.constant dense : vector<8xi1> // LOAD-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> -// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SUBVIEW]][%[[OFF2]], 0] -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// LOAD-ND-SAME: %[[COLLAPSED]] -// LOAD-ND-SAME: memref<256xf16, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf16, -// LOAD-ND-SAME: boundary_check = false 
-// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF2]]]{{.*}}-> vector<8xf16> -// LOAD-ND: return %[[VEC]] - -// LOAD-GATHER-LABEL: @load_from_subview( +// LOAD-ND: %[[BB:.+]], %[[OFFSET:.+]],{{.*}},{{.*}} = memref.extract_strided_metadata %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> memref, index, index, index, index, index +// LOAD-ND: %[[STEP:.+]] = vector.step : vector<8xindex> +// LOAD-ND: arith.muli {{.*}} : index +// LOAD-ND: arith.addi %[[OFFSET]]{{.*}} : index +// LOAD-ND: arith.addi {{.*}} : index +// LOAD-ND: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8xindex> +// LOAD-ND: %[[IDX:.+]] = arith.addi %[[SPLAT]], %[[STEP]] : vector<8xindex> +// LOAD-ND: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> index +// LOAD-ND: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64 +// LOAD-ND: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8xindex>, vector<8xi1> -> vector<8xf16> + +// LOAD-GATHER-LABEL: @load_from_subview_1D( // LOAD-GATHER-SAME: %[[SRC:.+]]: memref<4096x4096xf16>, // LOAD-GATHER-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index // LOAD-GATHER: %[[CST:.+]] = arith.constant dense : vector<8xi1> @@ -440,3 +445,42 @@ gpu.func @load_from_subview(%source: memref<4096x4096xf16>, %off1: index, %off2: // LOAD-GATHER: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64 // LOAD-GATHER: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8xindex>, vector<8xi1> -> vector<8xf16> } + +// ----- +gpu.module @xevm_module { +gpu.func @load_from_subview_2D(%source: memref<4096x4096xf16>, %off1: index, %off2: index) -> vector<8x16xf16> { + %c0 = arith.constant 0.0 : f16 + %subview = memref.subview %source[%off1, %off2] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> + %0 = vector.transfer_read %subview[%off2, %off2], %c0 + {in_bounds = [true, true]} : memref<256x256xf16, strided<[4096, 1], offset: ?>>, vector<8x16xf16> + gpu.return %0 : vector<8x16xf16> +} + +// LOAD-ND-LABEL: @load_from_subview_2D( +// LOAD-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>, +// LOAD-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// LOAD-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc +// LOAD-ND-SAME: %[[SUBVIEW]] +// LOAD-ND-SAME: memref<256x256xf16, strided<[4096, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf16, +// LOAD-ND-SAME: boundary_check = false +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF2]], %[[OFF2]]]{{.*}}-> vector<8x16xf16> +// LOAD-ND: return %[[VEC]] + +// LOAD-GATHER-LABEL: @load_from_subview_2D( +// LOAD-GATHER-SAME: %[[SRC:.+]]: memref<4096x4096xf16>, +// LOAD-GATHER-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// LOAD-GATHER: %[[CST:.+]] = arith.constant dense : vector<8x16xi1> +// LOAD-GATHER: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> +// LOAD-GATHER: %[[BB:.+]], %[[OFFSET:.+]],{{.*}},{{.*}} = memref.extract_strided_metadata %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> memref, index, index, index, index, index +// LOAD-GATHER-COUNT2: vector.step +// LOAD-GATHER-COUNT2: vector.shape_cast +// LOAD-GATHER-COUNT2: 
vector.broadcast +// LOAD-GATHER-COUNT2: arith.muli {{.*}} : index +// LOAD-GATHER-COUNT2: arith.addi {{.*}} : index +// LOAD-GATHER: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8x16xindex> +// LOAD-GATHER: %[[IDX:.+]] = arith.addi %[[SPLAT]], {{.*}} : vector<8x16xindex> +// LOAD-GATHER: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> index +// LOAD-GATHER: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64 +// LOAD-GATHER: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[IDX]]{{\]}}, %[[CST]] : i64, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf16> +} diff --git a/mlir/test/Dialect/X86Vector/vector-contract-to-fma.mlir b/mlir/test/Dialect/X86Vector/vector-contract-to-fma.mlir new file mode 100644 index 0000000000000..e506b166d43ff --- /dev/null +++ b/mlir/test/Dialect/X86Vector/vector-contract-to-fma.mlir @@ -0,0 +1,344 @@ +// RUN: mlir-opt %s -transform-interpreter -cse -split-input-file | FileCheck %s + +!vecA = vector<1x1xf32> +!vecB = vector<1x64xf32> +!vecC = vector<1x64xf32> +#map = affine_map<(d0, d1, d2) -> (d0, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +func.func @matmul_outer_product_to_fma( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @matmul_outer_product_to_fma +// CHECK: vector.broadcast{{.*}}vector<1xf32> to vector<64xf32> +// CHECK: vector.fma{{.*}}vector<64xf32> +// CHECK: vector.shape_cast{{.*}}vector<64xf32> to vector<1x64xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_fma + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<64x1xf32> +!vecB = vector<1x1xf32> +!vecC = vector<64x1xf32> +#map = affine_map<(d0, d1, d2) -> (d0, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +func.func @matmul_outer_product_to_fma_bcst_B( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @matmul_outer_product_to_fma_bcst_B +// CHECK: vector.broadcast +// CHECK: vector.fma{{.*}}vector<64xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_fma + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1x1xf32> +!vecB = vector<1x1x64xf32> +!vecC = vector<1x1x64xf32> +#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> +#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> +#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> +func.func 
@batch_matmul_to_fma( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @batch_matmul_to_fma +// CHECK: vector.broadcast +// CHECK: vector.fma{{.*}}vector<64xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_fma + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x64x1xf32> +!vecB = vector<1x1x1xf32> +!vecC = vector<1x64x1xf32> +#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> +#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> +#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> +func.func @batch_matmul_to_fma_bcst_B( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @batch_matmul_to_fma_bcst_B +// CHECK: vector.broadcast +// CHECK: vector.fma{{.*}}vector<64xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_fma + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1x1xf32> +!vecB = vector<1x1x64xf32> +!vecC = vector<1x64xf32> +#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> +#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> +#map2 = affine_map<(d0, d1, d2, d3) -> (d1, d2)> +func.func @brgemm_to_fma( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @brgemm_to_fma +// CHECK: vector.broadcast +// CHECK: vector.fma{{.*}}vector<64xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_fma + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x64x1xf32> +!vecB = vector<1x1x1xf32> +!vecC = vector<64x1xf32> +#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> +#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> +#map2 = affine_map<(d0, d1, d2, d3) -> (d1, d2)> +func.func @brgemm_to_fma_bcst_B( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC 
+ return %0 : !vecC +} + +// CHECK-LABEL: @brgemm_to_fma_bcst_B +// CHECK: vector.broadcast +// CHECK: vector.fma{{.*}}vector<64xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_fma + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<3x1x1xf32> +!vecB = vector<3x1x64xf32> +!vecC = vector<3x1x64xf32> +#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> +#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> +#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> +func.func @negative_non_unit_batch_dim( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// Batch dimension should've been simplified earlier. + +// CHECK-LABEL: @negative_non_unit_batch_dim +// CHECK-NOT: vector.fma +// CHECK: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_fma + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<3x1x1xf32> +!vecB = vector<3x1x64xf32> +!vecC = vector<1x64xf32> +#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> +#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> +#map2 = affine_map<(d0, d1, d2, d3) -> (d1, d2)> +func.func @negative_non_unit_batch_reduce_dim( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// Batch-reduce dimension should've been simplified earlier. 
+ +// CHECK-LABEL: @negative_non_unit_batch_reduce_dim +// CHECK-NOT: vector.fma +// CHECK: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_fma + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1xf32> +!vecB = vector<1x64xf32> +!vecC = vector<1x64xf32> +#map = affine_map<(d0, d1, d2) -> (d0, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +func.func @negative_invalid_kind( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @negative_invalid_kind +// CHECK-NOT: vector.fma +// CHECK: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_fma + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1x1xf32> +!vecB = vector<1x1x64xf32> +!vecC = vector<1x1x64xi32> +#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> +#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> +#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> +func.func @negative_accumulator_type( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @negative_accumulator_type +// CHECK-NOT: vector.fma +// CHECK: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_fma + } : !transform.any_op + transform.yield + } +} diff --git a/mlir/test/Dialect/X86Vector/vector-contract-to-packed-type-dotproduct.mlir b/mlir/test/Dialect/X86Vector/vector-contract-to-packed-type-dotproduct.mlir new file mode 100644 index 0000000000000..65676cbae772c --- /dev/null +++ b/mlir/test/Dialect/X86Vector/vector-contract-to-packed-type-dotproduct.mlir @@ -0,0 +1,681 @@ +// RUN: mlir-opt %s -transform-interpreter -cse -split-input-file | FileCheck %s + +!vecA = vector<1x1x1x2xbf16> +!vecB = vector<1x1x16x2xbf16> +!vecC = vector<1x16xf32> +#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)> +#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)> +#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d1, d2)> +func.func @brgemm_to_bf16dp( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"], + 
kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @brgemm_to_bf16dp +// CHECK: vector.broadcast +// CHECK: x86vector.avx512.dot + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x16x1x2xbf16> +!vecB = vector<1x1x1x2xbf16> +!vecC = vector<16x1xf32> +#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)> +#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)> +#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d1, d2)> +func.func @brgemm_to_bf16dp_bcst_B( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @brgemm_to_bf16dp_bcst_B +// CHECK: vector.broadcast +// CHECK: x86vector.avx512.dot + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1x1x4xi8> +!vecB = vector<1x1x8x4xi8> +!vecC = vector<1x8xi32> +#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)> +#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)> +#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d1, d2)> +func.func @brgemm_to_int8dp( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @brgemm_to_int8dp +// CHECK: vector.broadcast +// CHECK: x86vector.avx.dot.i8 + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1x1x2xbf16> +!vecB = vector<1x1x16x2xbf16> +!vecC = vector<1x1x16xf32> +#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)> +#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)> +#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d2)> +func.func @batch_matmul_bf16dp( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @batch_matmul_bf16dp +// CHECK: 
vector.broadcast +// CHECK: x86vector.avx512.dot + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1x1x4xi8> +!vecB = vector<1x1x8x4xi8> +!vecC = vector<1x1x8xi32> +#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)> +#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)> +#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d2)> +func.func @batch_matmul_int8dp( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + + +// CHECK-LABEL: @batch_matmul_int8dp +// CHECK: vector.broadcast +// CHECK: x86vector.avx.dot.i8 + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x8x1x4xi8> +!vecB = vector<1x1x1x4xi8> +!vecC = vector<1x8x1xi32> +#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)> +#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)> +#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d2)> +func.func @batch_matmul_int8dp_bcst_B( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + + +// CHECK-LABEL: @batch_matmul_int8dp_bcst_B +// CHECK: vector.broadcast +// CHECK: x86vector.avx.dot.i8 + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1x2xbf16> +!vecB = vector<1x16x2xbf16> +!vecC = vector<1x16xf32> +#map = affine_map<(d4, d1, d2, d3) -> (d1, d3, d4)> +#map1 = affine_map<(d4, d1, d2, d3) -> (d3, d2, d4)> +#map2 = affine_map<(d4, d1, d2, d3) -> (d1, d2)> +func.func @matmul_outer_product_to_bf16dp( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @matmul_outer_product_to_bf16dp +// CHECK: vector.broadcast +// CHECK: x86vector.avx512.dot + +module attributes {transform.with_named_sequence} { + transform.named_sequence 
@__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<16x1x2xbf16> +!vecB = vector<1x1x2xbf16> +!vecC = vector<16x1xf32> +#map = affine_map<(d4, d1, d2, d3) -> (d1, d3, d4)> +#map1 = affine_map<(d4, d1, d2, d3) -> (d3, d2, d4)> +#map2 = affine_map<(d4, d1, d2, d3) -> (d1, d2)> +func.func @matmul_outer_product_to_bf16dp_bcst_B( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @matmul_outer_product_to_bf16dp_bcst_B +// CHECK: vector.broadcast +// CHECK: x86vector.avx512.dot + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1x4xi8> +!vecB = vector<1x8x4xi8> +!vecC = vector<1x8xi32> +#map = affine_map<(d4, d1, d2, d3) -> (d1, d3, d4)> +#map1 = affine_map<(d4, d1, d2, d3) -> (d3, d2, d4)> +#map2 = affine_map<(d4, d1, d2, d3) -> (d1, d2)> +func.func @matmul_outer_product_to_int8dp( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @matmul_outer_product_to_int8dp +// CHECK: vector.broadcast +// CHECK: x86vector.avx.dot.i8 + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1x2xbf16> +!vecB = vector<1x16x2xbf16> +!vecC = vector<1x16xf32> +#map = affine_map<(d4, d1, d2, d3) -> (d1, d3, d4)> +#map1 = affine_map<(d4, d1, d2, d3) -> (d3, d2, d4)> +#map2 = affine_map<(d4, d1, d2, d3) -> (d1, d2)> +func.func @negative_invalid_vc_kind( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @negative_invalid_vc_kind +// CHECK-NOT: x86vector.avx512.dot +// CHECK: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + 
transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1x1x4xbf16> +!vecB = vector<1x1x16x4xbf16> +!vecC = vector<1x16xf32> +#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)> +#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)> +#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d1, d2)> +func.func @negative_false_vnni_bf16( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @negative_false_vnni_bf16 +// CHECK-NOT: x86vector.avx512.dot +// CHECK: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1x1x2xi8> +!vecB = vector<1x1x8x2xi8> +!vecC = vector<1x8xi32> +#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)> +#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)> +#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d1, d2)> +func.func @negative_false_vnni_int8( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @negative_false_vnni_int8 +// CHECK-NOT: x86vector.avx.dot.i8 +// CHECK: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<3x1x1x2xbf16> +!vecB = vector<3x1x16x2xbf16> +!vecC = vector<3x1x16xf32> +#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)> +#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)> +#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d2)> +func.func @negative_batch_dimension( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @negative_batch_dimension +// CHECK-NOT: x86vector.avx512.dot +// CHECK: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + 
transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<2x1x1x4xi8> +!vecB = vector<2x1x8x4xi8> +!vecC = vector<1x8xi32> +#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)> +#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)> +#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d1, d2)> +func.func @negative_brgemm_dimension( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @negative_brgemm_dimension +// CHECK-NOT: x86vector.avx.dot.i8 +// CHECK: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1x1x2xbf16> +!vecB = vector<1x1x16x2xbf16> +!vecC = vector<1x1x16xbf16> +#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)> +#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)> +#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d2)> +func.func @negative_float_acc_type( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @negative_float_acc_type +// CHECK-NOT: x86vector.avx512.dot +// CHECK: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1x1x4xi8> +!vecB = vector<1x1x8x4xi8> +!vecC = vector<1x1x8xi8> +#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)> +#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)> +#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d2)> +func.func @negative_int_acc_type( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @negative_int_acc_type +// CHECK-NOT: x86vector.avx.dot.i8 +// CHECK: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + 
transform.yield + } +} + +// ----- + +!vecA = vector<1x1x1x4xbf16> +!vecB = vector<1x1x16x4xbf16> +!vecC = vector<1x1x16xbf16> +#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)> +#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)> +#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d2)> +func.func @negative_wrong_vnni_blocking_factor_bf16( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @negative_wrong_vnni_blocking_factor_bf16 +// CHECK-NOT: x86vector.avx512.dot +// CHECK: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1x1xbf16> +!vecB = vector<1x1x32xbf16> +!vecC = vector<1x32xf32> +#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> +#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> +#map2 = affine_map<(d0, d1, d2, d3) -> (d1, d2)> +func.func @negative_brgemm_not_vnni( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @negative_brgemm_not_vnni +// CHECK-NOT: x86vector.avx512.dot +// CHECK: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1x1x4xi8> +!vecB = vector<1x1x16x4xi8> +!vecC = vector<1x1x16xi32> +#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)> +#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)> +#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d2)> +func.func @negative_wrong_vector_shape_int8( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @negative_wrong_vector_shape_int8 +// CHECK-NOT: x86vector.avx.dot.i8 +// CHECK: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} + +// ----- + +!vecA = vector<1x1x1x2xbf16> +!vecB = 
vector<1x1x32x2xbf16> +!vecC = vector<1x1x32xf32> +#map = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d3, d4)> +#map1 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d3, d2, d4)> +#map2 = affine_map<(d0, d4, d1, d2, d3) -> (d0, d1, d2)> +func.func @negative_wrong_vector_shape_bf16( + %arg0: !vecA, %arg1: !vecB, %arg2: !vecC) -> !vecC +{ + %0 = vector.contract { + indexing_maps = [#map, #map1, #map2], + iterator_types = ["parallel", "reduction", "parallel", "parallel", "reduction"], + kind = #vector.kind} + %arg0, %arg1, %arg2 + : !vecA, !vecB into !vecC + return %0 : !vecC +} + +// CHECK-LABEL: @negative_wrong_vector_shape_bf16 +// CHECK-NOT: x86vector.avx512.dot +// CHECK: vector.contract + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %func = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %func { + transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product + } : !transform.any_op + transform.yield + } +} diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 1421ec553f251..6d2eedbfe2415 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -2578,6 +2578,50 @@ cc_library( ], ) +td_library( + name = "X86VectorTransformOpsTdFiles", + srcs = [ + "include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.td", + ], + includes = ["include"], + deps = [ + ":OpBaseTdFiles", + ":SideEffectInterfacesTdFiles", + ":TransformDialectTdFiles", + ], +) + +gentbl_cc_library( + name = "X86VectorTransformOpsIncGen", + tbl_outs = { + "include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp.inc": ["-gen-op-defs"], + }, + tblgen = ":mlir-tblgen", + td_file = "include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.td", + deps = [ + ":X86VectorTransformOpsTdFiles", + ], +) + +cc_library( + name = "X86VectorTransformOps", + srcs = glob(["lib/Dialect/X86Vector/TransformOps/*.cpp"]), + hdrs = glob(["include/mlir/Dialect/X86Vector/TransformOps/*.h"]), + includes = ["include"], + deps = [ + ":IR", + ":LLVMCommonConversion", + ":LLVMDialect", + ":TransformDialect", + ":TransformDialectInterfaces", + ":VectorDialect", + ":X86VectorDialect", + ":X86VectorTransformOpsIncGen", + ":X86VectorTransforms", + ], +) + cc_library( name = "X86VectorTransforms", srcs = glob(["lib/Dialect/X86Vector/Transforms/*.cpp"]), @@ -2588,6 +2632,10 @@ cc_library( ":IR", ":LLVMCommonConversion", ":LLVMDialect", + ":LinalgDialect", + ":LinalgInterfaces", + ":Pass", + ":TransformUtils", ":VectorDialect", ":VectorUtils", ":X86VectorDialect", @@ -9571,6 +9619,7 @@ cc_library( ":UBToLLVM", ":VectorToLLVM", ":VectorTransformOps", + ":X86VectorTransformOps", ":XeGPUTransformOps", ":XeVMToLLVM", ":XeVMToLLVMIRTranslation",
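
A minimal C++ sketch of driving the new lowering outside the transform interpreter. The tests above exercise the patterns through transform.apply_patterns.x86vector.vector_contract_to_packed_type_dot_product; the same rewrites can be applied from a regular pass by calling the populate function this patch defines. The include path for that declaration and the use of the greedy rewrite driver are assumptions for illustration, not part of the patch.

// Sketch only: assumes populateVectorContractToPackedTypeDotProductPatterns is
// declared in the X86Vector Transforms header; adjust the include if the patch
// declares it elsewhere.
#include "mlir/Dialect/X86Vector/Transforms.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

using namespace mlir;

// Greedily rewrites eligible vector.contract ops nested under `root` into
// x86vector.avx512.dot / x86vector.avx.dot.i8 sequences, mirroring what the
// transform sequences in the new .mlir tests do for each func.func.
static LogicalResult lowerContractsToPackedDotProducts(Operation *root) {
  RewritePatternSet patterns(root->getContext());
  x86vector::populateVectorContractToPackedTypeDotProductPatterns(patterns);
  return applyPatternsGreedily(root, std::move(patterns));
}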